From abccb2fa4fdb073436e61d44ac3e13affded0d0b Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 Sep 2016 15:19:40 -0400 Subject: [PATCH 01/25] add back train method but mark as deprecated --- .../src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index d2febf61ad34..dd1416c8459a 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -173,7 +173,7 @@ object XGBoost extends Serializable { require(tracker.start(trackerConf.workerConnectionTimeout), "FAULT: Failed to start tracker") tracker } - + /** * @return A tuple of the booster and the metrics used to build training summary */ From 34346c90a84feef54bb7ec7c2d5f38fb27bb1c9c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 10 Oct 2016 15:41:18 -0400 Subject: [PATCH 02/25] fix scalastyle error --- .../src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index dd1416c8459a..d2febf61ad34 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -173,7 +173,7 @@ object XGBoost extends Serializable { require(tracker.start(trackerConf.workerConnectionTimeout), "FAULT: Failed to start tracker") tracker } - + /** * @return A tuple of the booster and the metrics used to build training summary */ From 9a77997ff76585875db63855bc40dfe73282584e Mon Sep 
17 00:00:00 2001 From: CodingCat Date: Wed, 14 Sep 2016 15:19:40 -0400 Subject: [PATCH 03/25] add back train method but mark as deprecated --- .../src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index d2febf61ad34..dd1416c8459a 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -173,7 +173,7 @@ object XGBoost extends Serializable { require(tracker.start(trackerConf.workerConnectionTimeout), "FAULT: Failed to start tracker") tracker } - + /** * @return A tuple of the booster and the metrics used to build training summary */ From 45f9dba03f3cebde0bafb57fb6ae7672d70609b3 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 10 Oct 2016 15:41:18 -0400 Subject: [PATCH 04/25] fix scalastyle error --- .../src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index dd1416c8459a..d2febf61ad34 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -173,7 +173,7 @@ object XGBoost extends Serializable { require(tracker.start(trackerConf.workerConnectionTimeout), "FAULT: Failed to start tracker") tracker } - + /** * @return A tuple of the booster and the metrics used to build training summary */ From f3e4eb4e18c478ef80d17a718c93002f087b22e8 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 25 Jul 2018 
22:11:32 -0700 Subject: [PATCH 05/25] add new --- jvm-packages/xgboost4j-spark/docs/index.md | 138 +++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 jvm-packages/xgboost4j-spark/docs/index.md diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md new file mode 100644 index 000000000000..4ed4092186ed --- /dev/null +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -0,0 +1,138 @@ +# XGBoost4J-Spark + +(3 - 4 sentences intro) + +# Build an Application with XGBoost4J-Spark + +(based on maven build), step by step (structure of program) + +## Data Preparation + +As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables + users to apply various types of transformation over the training/test datasets with the convenient + and powerful data processing framework, Spark. + +In this section, we use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset as an example to + showcase how we use Spark to transform raw dataset and make it fit the requirement of XGBoost. + +Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", +"petal length" and "petal width". "class" column in each instance is +essentially the label which has three distinct values: "Iris Setosa", "Iris Versicolour" +and "Iris Virginica". + +### Read Dataset with Spark Built-In CSV Reader + +The first thing in data transformation is to load the dataset as Spark's structured data abstraction, +DataFrame. 
+ +```scala + import org.apache.spark.sql.SparkSession + import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + + val spark = SparkSession.builder().getOrCreate() + val schema = new StructType(Array( + StructField("sepal length", DoubleType, true), + StructField("sepal width", DoubleType, true), + StructField("petal length", DoubleType, true), + StructField("petal width", DoubleType, true), + StructField("class", StringType, true))) + val rawInput = spark.read.schema(schema).csv("input_path") +``` + +At the first line, we create an instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) + which is the entry point of any Spark program working with DataFrame. The `schema` variable defines the schema of the DataFrame wrapping + Iris data from the csv file. With this explicitly set schema, we can define the columns' names as well as their types. Finally, we can + use the built-in csv reader to load the Iris csv file as a DataFrame named `rawInput`. + +### Transform Raw Iris Dataset + +To make the Iris dataset recognizable to XGBoost, we need to + +1. Transform String-typed label, i.e. "class", to Integer-typed label. + +2. Assemble the feature columns as a vector to build XGBoost's internal data representation, DMatrix. + +To convert String-typed label to Integer, we can use Spark's built-in feature transformer StringIndexer. + +```scala + import org.apache.spark.ml.feature.StringIndexer + val stringIndexer = new StringIndexer(). + setInputCol("class"). + setOutputCol("classIndex"). + fit(rawInput) + val labelTransformed = stringIndexer.transform(rawInput).drop("class") +``` + +To create a StringIndexer, we set the input column, i.e. the column containing String-typed label, and the output column, +i.e. the column to contain the Integer-typed label. Then we `fit` the StringIndexer with our input DataFrame so that Spark internals can +get information like total number of distinct values, etc. 
Now we have a StringIndexer ready to be applied to our input DataFrame. + +To execute the transformation logic of StringIndexer, we `transform` the input DataFrame with the StringIndexer and, for simplicity, +we drop the column `class` which contains the original String-typed labels. + +`fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each +transformer applies the `transform` method to a dataset to add a new column which contains transformed features/labels or prediction results, etc. +You can find more details [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). + +Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", +"petal length" and "petal width" as a vector. + +```scala + import org.apache.spark.ml.feature.VectorAssembler + val vectorAssembler = new VectorAssembler(). + setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). + setOutputCol("features") + val xgbInput = vectorAssembler.transform(labelTransformed).select("features", + "classIndex") +``` + +Now, we have a DataFrame containing only two columns, "features" which contains vector-represented +"sepal length", "sepal width", "petal length" and "petal width"; and also "classIndex" which has Integer-typed +labels. This DataFrame can be fed directly to XGBoost to train a model. + +## Training + +XGBoost supports both Regression and Classification. In this doc we use the Iris dataset to show the usage of XGBoost +in the case of multi-class Classification. The usage in Regression is very similar to that in Classification. 
+ +To train a XGBoost model for classification, we need to claim a XGBoostClassifier first: + +```scala + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") +``` + + + +### Current Version of Gang Scheduling + +based on spark even listener + +### Checkpoint Support + +## Prediction + +Highlight the recommended way (batching prediction) + +briefly talk about single-instance prediction + +## Model Persistence + +(also talk about how to train a model in Spark and use it in python environment) + +# Building a ML Pipeline with XGBoost4J-Spark + + + + + + + From ff84cf241ca1003b6f29ef96c81e181bc71ed569 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 25 Jul 2018 22:43:13 -0700 Subject: [PATCH 06/25] update doc --- jvm-packages/xgboost4j-spark/docs/index.md | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 4ed4092186ed..5bada1b23a4b 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -110,7 +110,56 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie setLabelCol("classIndex") ``` +The available parameters for training a XGBoost model can be found in [here](https://xgboost.readthedocs.io/en/latest/parameter.html). +In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case-variance of these parameters to keep consistent +with Spark's MLLIB parameters. Specifically, each parameter in [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its +equivalent form in XGBoost4J-Spark with camel case. 
For example, to set max_depth for each tree, you can pass the parameter as we + do in the above code snippet, or you can do it through setters in XGBoostClassifier: + + ```scala + val xgbClassifier1 = new XGBoostClassifier(). + setFeaturesCol("features"). + setLabelCol("classIndex") + xgbClassifier1.setMaxDepth(2) + ``` + +After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel, and apply +transformation to the DataFrame containing training set, i.e. xgbInput. + +```scala + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + val results = xgbClassificationModel.transform(xgbInput) +``` +Now, we get a DataFrame, result, containing margin, probability for each class and the prediction for each instance + +```scala ++-----------------+----------+--------------------+--------------------+----------+ +| features|classIndex| rawPrediction| probability|prediction| ++-----------------+----------+--------------------+--------------------+----------+ +|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| 
+|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| ++-----------------+----------+--------------------+--------------------+----------+ + +``` ### Current Version of Gang Scheduling From 395b2a9fa3ff80389786b75593d17c81dc0c7d8b Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 25 Jul 2018 23:37:21 -0700 Subject: [PATCH 07/25] finish Gang Scheduling --- jvm-packages/xgboost4j-spark/docs/index.md | 56 +++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 5bada1b23a4b..de7894cab396 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -161,12 +161,64 @@ Now, we get a DataFrame, result, containing margin, probability for each class a ``` -### Current Version of Gang Scheduling +### Parallel/Distributed Training -based on spark even listener +One of the most important parameters we set for XGBoostClassifier is "num_workers" (or "numWorkers"). +This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. + +In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task. By default, we allocate a core per each XGBoost worker. +Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved + by running multiple workers (i.e. Spark tasks) at the same time. + + If you do want OpenMP optimization, you have to + + 1. 
set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor + + 2. set `spark.task.cpus` in Spark to the same value as `nthread` + +### Run XGBoost4J-Spark in Production + +XGBoost4J-Spark has attracted a lot of users from industry and is deployed in many production environments. We also include many features +that enable running XGBoost4J-Spark in production smoothly. + +#### Gang Scheduling + +XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) + to synchronize the stats of each worker. Therefore XGBoost4J-Spark requires that all of the `nthread * numWorkers` cores + be available before the training runs. + +However, in a production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get +all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more +cores to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and tries to claim more. + Additionally, this usually happens silently and does not come to the attention of users. + + XGBoost4J-Spark allows the user to set up a timeout threshold for claiming resources from the cluster. If the application cannot get + enough resources within this time period, the application will fail instead of wasting resources by hanging for a long time. To enable this feature, + you can set it on XGBoostClassifier: + + ```scala + xgbClassifier.setTimeoutRequestWorkers(60000L) + ``` + + or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "timeout_request_workers" -> 60000L) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). 
+ setLabelCol("classIndex") + ``` ### Checkpoint Support + + ## Prediction Highlight the recommended way (batching prediction) From 8c713e55fce96154c926797378dd9fd2b6183b1a Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Thu, 26 Jul 2018 00:02:42 -0700 Subject: [PATCH 08/25] more --- jvm-packages/xgboost4j-spark/docs/index.md | 67 +++++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index de7894cab396..22a8a508749b 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -4,7 +4,41 @@ # Build an Application with XGBoost4J-Spark -(based on maven build), step by step (structure of program) +To build a Spark application with XGBoost4J-Spark, you first need to refer to the dependency in maven_central, + +You can add the following dependency in your pom file. + +```xml + + ml.dmlc + xgboost4j-spark + latest_version_num + +``` + +For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). + +We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access +these functionalities, you can refer the dependency to snapshot artifacts. We publish snapshot version in github-based repo, so +you first need to add the following repo in pom.xml: + +```xml + + GitHub Repo + GitHub Repo + https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ + +``` + +and then refer to the snapshot dependency by adding: + +```xml + + ml.dmlc + xgboost4j + next_version_num-SNAPSHOT + +``` ## Data Preparation @@ -215,9 +249,38 @@ cores to be available. This process usually brings unnecessary resource waste as setLabelCol("classIndex") ``` -### Checkpoint Support +#### Checkpoint During Training +Transient Failures are commonly seen in production environment. 
To simplify the design of XGBoost, + we stop training if any of the distributed workers fail. Additionally, to efficiently recover failed training, we support + checkpoint mechanism to facilitate failure recovery. + + To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and + the path store checkpointPath with `setCheckpointPath`: + + ```scala + xgbClassifier.setCheckpointInterval(2) + xgbClassifier.setCheckpointPath("/checkpoint_path") + ``` + + an equivalent way is to pass in parameters in XGBoostClassifier's constructor: + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "checkpoint_path" -> "/checkpoints", + "checkpoint_interval" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + ``` +If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file +in `/checkpoints` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. ## Prediction From dbd5d9fb78c543ca86b61a88f58878afbde6dbcd Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Thu, 26 Jul 2018 00:20:28 -0700 Subject: [PATCH 09/25] intro --- jvm-packages/xgboost4j-spark/docs/index.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 22a8a508749b..d987b5ccc699 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -1,7 +1,13 @@ # XGBoost4J-Spark -(3 - 4 sentences intro) - +XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Spark's MLLIB framework. 
With the integration, + users can not only use the highly performant algorithm implementation of XGBoost, but also leverage the existing tools that let them + easily work on: + + * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection + * Pipelines: constructing, evaluating, and tuning ML Pipelines + * Persistence: persist and load machine learning models and even whole Pipelines + # Build an Application with XGBoost4J-Spark To build a Spark application with XGBoost4J-Spark, you first need to refer to the dependency in maven_central, From fdec071783d568fe144379afb672cfad8bdea9c8 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 27 Jul 2018 22:12:47 -0700 Subject: [PATCH 10/25] Add sections: Prediction, Model persistence and ML pipeline. --- jvm-packages/xgboost4j-spark/docs/index.md | 230 +++++++++++++++++---- 1 file changed, 192 insertions(+), 38 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index d987b5ccc699..5886377bb386 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -166,41 +166,6 @@ equivalent form in XGBoost4J-Spark with camel case. For example, to set max_dept After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel, and apply transformation to the DataFrame containing training set, i.e. xgbInput. 
-```scala - val xgbClassificationModel = xgbClassifier.fit(xgbInput) - val results = xgbClassificationModel.transform(xgbInput) -``` - -Now, we get a DataFrame, result, containing margin, probability for each class and the prediction for each instance - -```scala -+-----------------+----------+--------------------+--------------------+----------+ -| features|classIndex| rawPrediction| probability|prediction| -+-----------------+----------+--------------------+--------------------+----------+ -|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| 
-+-----------------+----------+--------------------+--------------------+----------+ - -``` - ### Parallel/Distributed Training One of the most important parameters we set for XGBoostClassifier is "num_workers" (or "numWorkers"). @@ -290,16 +255,205 @@ in `/checkpoints` and start from the iteration when the checkpoint was built unt ## Prediction -Highlight the recommended way (batching prediction) +XGBoost4j-Spark supports two ways of model serving: batch prediction and single instance prediction. + +### Batch prediction + +When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, reads the column containing feature vectors, +predicts for each feature vector, and outputs a new DataFrame with the following columns by default: + +* `XGBoostClassificationModel` will output raw predictions for each possible label (`rawPredictionCol`), + the probability of each possible label (`probabilityCol`), and the predicted label (`predictionCol`). +* `XGBoostRegressionModel` will output the predicted label (`predictionCol`). 
+ +```scala + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + val results = xgbClassificationModel.transform(xgbInput) +``` + +Now, we get a DataFrame, result containing margin, probability for each class and the prediction for each instance + +```scala ++-----------------+----------+--------------------+--------------------+----------+ +| features|classIndex| rawPrediction| probability|prediction| ++-----------------+----------+--------------------+--------------------+----------+ +|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| 
++-----------------+----------+--------------------+--------------------+----------+ + +``` -briefly talk about single-instance prediction +### Single instance prediction + +`XGBoostClassificationModel` and `XGBoostRegressionModel` also support making a prediction on a single instance. +Each accepts a single Vector of features and outputs the predicted double label. +However, this function's performance is not ideal; use it carefully! + +```scala + val features = xgbInput.head().getAs[Vector]("features") + val result = xgbClassificationModel.predict(features) +``` ## Model Persistence -(also talk about how to train a model in Spark and use it in python environment) +### Model and pipeline persistence + +A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. +So it's important to support model persistence. + +XGBoost4j-Spark supports saving/loading `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`, and +it also supports saving/loading an ML pipeline which includes these estimators and models. + +We can save the XGBoostClassificationModel to the file system: + +```scala + val xgbClassificationModelPath = "/tmp/xgbClassificationModel" + xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath) +``` + +and then load the model in another session: + +```scala + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel + + val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) + xgbClassificationModel2.transform(xgbInput) +``` + +For saving and loading an ML pipeline, please refer to the next section. + +### Export to local + +After we train a model with XGBoost4j-Spark on a massive dataset, sometimes we want to do model serving on a single machine +or integrate it with other single node libraries for further processing. 
XGBoost4j-Spark supports exporting the model to the local file system by: + +```scala + val nativeModelPath = "/tmp/nativeModel" + xgbClassificationModel.nativeBooster.saveModel(nativeModelPath) +``` + +Then we can load this model with single node Python XGBoost: + +```python + import xgboost as xgb + bst = xgb.Booster({'nthread': 4}) + bst.load_model("/tmp/nativeModel") +``` # Building a ML Pipeline with XGBoost4J-Spark +## Basic ML Pipeline + +Spark ML pipeline can combine multiple algorithms or functions into a single pipeline. +It covers everything from feature extraction/transformation/selection to model training/prediction. +XGBoost4j-Spark makes it feasible to embed XGBoost into such a pipeline seamlessly. +The following example shows how to build such a pipeline consisting of Spark MLlib feature transformers +and an XGBoostClassifier estimator. + +We still use the [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset and the ```rawInput``` DataFrame. +First we need to split the dataset into training and test datasets. + +```scala + val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) +``` + +Then we build the ML `Pipeline` which includes 4 stages: +* Assemble all features into a single vector column. +* Convert the string label to an indexed double label. +* Use `XGBoostClassifier` to train a classification model. +* Convert the indexed double label back to the original string label. 
+ +Then we run this `Pipeline` to get a `PipelineModel`: + +```scala + import org.apache.spark.ml.feature._ + import org.apache.spark.ml.Pipeline + + val assembler = new VectorAssembler() + .setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")) + .setOutputCol("features") + val labelIndexer = new StringIndexer() + .setInputCol("class") + .setOutputCol("label") + .fit(training) + val booster = new XGBoostClassifier( + Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2 + ) + ) + val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("realLabel") + .setLabels(labelIndexer.labels) + + val pipeline = new Pipeline() + .setStages(Array(assembler, labelIndexer, booster, labelConverter)) + val model = pipeline.fit(training) +``` + +After we get the PipelineModel, we can make predictions on the test dataset and evaluate the model accuracy. + +```scala + import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator + + val prediction = model.transform(test) + val evaluator = new MulticlassClassificationEvaluator() + val accuracy = evaluator.evaluate(prediction) +``` + +## Pipeline with Hyper-parameter Tuning + +The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. +Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, +we can utilize the Spark model selecting tool to automate this process. + +The following example shows the code snippet utilizing `CrossValidator` and `MulticlassClassificationEvaluator` +to search for the optimal combination of two XGBoost parameters, [`max_depth` and `eta`](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md). +The model producing the maximum accuracy defined by `MulticlassClassificationEvaluator` is selected and used to generate the prediction for the test set. 
+ +```scala + import org.apache.spark.ml.tuning._ + import org.apache.spark.ml.PipelineModel + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel + + val paramGrid = new ParamGridBuilder() + .addGrid(booster.maxDepth, Array(3, 8)) + .addGrid(booster.eta, Array(0.2, 0.6)) + .build() + val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(3) + + val cvModel = cv.fit(training) + + val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) + .asInstanceOf[XGBoostClassificationModel] + bestModel.extractParamMap() +``` From 0e3e71de403510fa8ebfdb137390ad5793bc8a78 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 27 Jul 2018 22:13:53 -0700 Subject: [PATCH 11/25] Add XGBoost4j-Spark MLlib pipeline example --- .../example/spark/SparkMLlibPipeline.scala | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala new file mode 100644 index 000000000000..e9c575b3214b --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala @@ -0,0 +1,129 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.example.spark + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.tuning._ +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types._ + +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel} + +object SparkMLlibPipeline { + + def main(args: Array[String]): Unit = { + // All three positional arguments are required; they are read as args(0)..args(2) below. + if (args.length != 3) { + println("Usage: SparkMLlibPipeline input_path native_model_path pipeline_model_path") + sys.exit(1) + } + + val inputPath = args(0) + val nativeModelPath = args(1) + val pipelineModelPath = args(2) + + val spark = SparkSession + .builder() + .appName("XGBoost4J-Spark Pipeline Example") + .getOrCreate() + + // Load dataset + val schema = new StructType(Array( + StructField("sepal length", DoubleType, true), + StructField("sepal width", DoubleType, true), + StructField("petal length", DoubleType, true), + StructField("petal width", DoubleType, true), + StructField("species", StringType, true))) + + val rawInput = spark.read.schema(schema).csv(inputPath) + + // Split training and test dataset + val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) + + // Build ML pipeline, it includes 4 stages: + // 1, Assemble all features into a single vector column. + // 2, From string label to indexed double label. + // 3, Use XGBoostClassifier to train classification model. + // 4, Convert indexed double label back to original string label. 
+ val assembler = new VectorAssembler() + .setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")) + .setOutputCol("features") + val labelIndexer = new StringIndexer() + .setInputCol("species") + .setOutputCol("label") + .fit(training) + val booster = new XGBoostClassifier( + Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2 + ) + ) + val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("realLabel") + .setLabels(labelIndexer.labels) + + val pipeline = new Pipeline() + .setStages(Array(assembler, labelIndexer, booster, labelConverter)) + val model = pipeline.fit(training) + + // Batch prediction + val prediction = model.transform(test) + prediction.show(false) + + // Model evaluation; the evaluator's default metric is F1, so ask for accuracy explicitly + val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy") + val accuracy = evaluator.evaluate(prediction) + println("The model accuracy is : " + accuracy) + + // Tune model using cross validation + val paramGrid = new ParamGridBuilder() + .addGrid(booster.maxDepth, Array(3, 8)) + .addGrid(booster.eta, Array(0.2, 0.6)) + .build() + val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(3) + + val cvModel = cv.fit(training) + + val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) + .asInstanceOf[XGBoostClassificationModel] + println("The params of best XGBoostClassification model : " + + bestModel.extractParamMap()) + println("The training summary of best XGBoostClassificationModel : " + + bestModel.summary) + + // Export the XGBoostClassificationModel as local XGBoost model, + // then you can load it back in local Python environment. 
+ bestModel.nativeBooster.saveModel(nativeModelPath) + + // ML pipeline persistence + model.write.overwrite().save(pipelineModelPath) + + // Load a saved model and serving + val model2 = PipelineModel.load(pipelineModelPath) + model2.transform(test).show(false) + } +} \ No newline at end of file From 3a849d9916c00b6bcac274b9926745fe5f1e7b02 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Sat, 28 Jul 2018 15:06:40 -0700 Subject: [PATCH 12/25] partial finished version --- jvm-packages/xgboost4j-spark/docs/index.md | 306 +++++++++++---------- 1 file changed, 160 insertions(+), 146 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 5886377bb386..86994ea8c847 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -1,16 +1,24 @@ -# XGBoost4J-Spark +# XGBoost4J-Spark Tutorial (version >= 0.8) -XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Spark's MLLIB framework. With the integration, - user can not only using the highly performant algorithm implementation of XGBoost, but also it leverages the existing tools for user to - easily work on: +XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: - * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection + * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. * Pipelines: constructing, evaluating, and tuning ML Pipelines * Persistence: persist and load machine learning models and even whole Pipelines + +This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. 
We will discuss + + * Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface + * Training a XGBoost model with XGBoost4J-Spark + * Serving XGBoost model (prediction) with Spark + * Building a Machine Learning Pipeline with XGBoost4J-Spark + * Running XGBoost4J-Spark in Production -# Build an Application with XGBoost4J-Spark +# Build an ML Application with XGBoost4J-Spark + +## Refer to XGBoost4J-Spark Dependency -To build a Spark application with XGBoost4J-Spark, you first need to refer to the dependency in maven_central, +Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central. You can add the following dependency in your pom file. @@ -24,9 +32,7 @@ You can add the following dependency in your pom file. For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). -We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access -these functionalities, you can refer the dependency to snapshot artifacts. We publish snapshot version in github-based repo, so -you first need to add the following repo in pom.xml: +We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in pom.xml: ```xml @@ -46,6 +52,7 @@ and then refer to the snapshot dependency by adding: ``` + ## Data Preparation As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables @@ -53,14 +60,12 @@ As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The and powerful data processing framework, Spark. 
In this section, we use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset as an example to - showcase how we use Spark to transform raw dataset and make it fit the requirement of XGBoost. + showcase how we use Spark to transform raw dataset and make it fit the data interface of XGBoost. Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", -"petal length" and "petal width". "class" column in each instance is -essentially the label which has three distinct values: "Iris Setosa", "Iris Versicolour" -and "Iris Virginica". +"petal length" and "petal width". The "class" column in each instance is essentially the label which has three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". -### Read Dataset with Spark Built-In CSV Reader +### Read Dataset with Spark's Built-In Reader The first thing in data transformation is to load the dataset as Spark's structured data abstraction, DataFrame. @@ -79,10 +84,9 @@ DataFrame. val rawInput = spark.read.schema(schema).csv("input_path") ``` -At the first line, we create a instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) - which is the entry of any Spark program working with DataFrame. The `schema` variable defines the schema of DataFrame wrapping - Iris data from csv file. With this explicitly set schema, we can define the columns' name as well as their types. Finally, we can - use the built-in csv reader to load Iris csv file as a DataFrame named `rawInput`. +At the first line, we create an instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) which is the entry point of any Spark program working with DataFrame. The `schema` variable defines the schema of DataFrame wrapping Iris data. 
With this explicitly set schema, we can define the columns' names as well as their types; otherwise the column names would be the default ones derived by Spark, such as `_c0`, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named `rawInput`. + +Spark also contains many built-in readers for other formats. The latest version of Spark supports CSV, JSON, Parquet and LibSVM. ### Transform Raw Iris Dataset @@ -103,19 +107,18 @@ To convert String-typed label to Integer, we can use Spark's built-in feature tr val labelTransformed = stringIndexer.transform(rawInput).drop("class") ``` -To create a StringIndexer, we set input column, i.e. the column containing String-typed label, and output column, -i.e. the column to contain the Integer-typed label. Then we `fit` StringIndex with our input DataFrame so that Spark internals can -get information like total number of distinct values, etc. Now we have a StringIndexer ready to be applied to our input DataFrame. +With a newly created StringIndexer instance: + +1. we set input column, i.e. the column containing String-typed label +2. we set output column, i.e. the column to contain the Integer-typed label. +3. Then we `fit` the StringIndexer with our input DataFrame, 'rawInput', so that Spark internals can get information like total number of distinct values, etc. -To execute the transformation logic of StringIndexer, we `transform` the input DataFrame with the StringIndexer and to keep simplicity, -we drop the column `class` which contains the original String-typed labels. +Now we have a StringIndexer which is ready to be applied to our input DataFrame. 
To execute the transformation logic of StringIndexer, we `transform` the input DataFrame, 'rawInput', and to keep the DataFrame concise, +we drop the column `class` and only keep the feature columns and the transformed Integer-typed label column (in the last line of the above code snippet). -`fit` and `transform` are two key operations in MLLIB. 
Basically, `fit` produces a "transformer", e.g. StringIndexer, and each -transformer apply `transform` method on dataset to add new column which contains transformed features/labels or prediction results, etc. -You can find more details in [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). +`fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each transformer applies `transform` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about `fit` and `transform`, You can find more details in [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). -Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", -"petal length" and "petal width" as a vector. +Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. ```scala import org.apache.spark.ml.feature.VectorAssembler @@ -127,13 +130,13 @@ Similarly, we can use another transformer, 'VectorAssembler', to assemble featur ``` Now, we have a DataFrame containing only two columns, "features" which contains vector-represented -"sepal length", "sepal width", "petal length" and "petal width"; and also "classIndex" which has Integer-typed -labels. This DataFrame can be feed to train a XGBoost model directly. +"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Integer-typed +labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. + ## Training -XGBoost support both Regression and Classification. In this doc we use Iris dataset to show the usage of XGBoost -in the case of multi-class Classification. 
The usage in Regression is very similar with Classification. +XGBoost supports both Regression and Classification. While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-class Classification problem, the usage in Regression is very similar to Classification. To train a XGBoost model for classification, we need to claim a XGBoostClassifier first: @@ -150,128 +153,43 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie setLabelCol("classIndex") ``` -The available parameters for training a XGBoost model can be found in [here](https://xgboost.readthedocs.io/en/latest/parameter.html). -In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case-variance of these parameters to keep consistent -with Spark's MLLIB parameters. Specifically, each parameter in [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its -equivalent form in XGBoost4J-Spark with camel case. For example, to set max_depth for each tree, you can pass parameter just like what we - do in the above code snippet, or you can do it through setters in XGBoostClassifer: +The available parameters for training a XGBoost model can be found in [here](https://xgboost.readthedocs.io/en/latest/parameter.html). In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variants of these parameters to stay consistent with Spark's MLLIB parameters. + +Specifically, each parameter in [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its +equivalent form in XGBoost4J-Spark with camel case. For example, to set max_depth for each tree, you can pass parameter just like what we did in the above code snippet (as `max_depth` wrapped in a Map), or you can do it through setters in XGBoostClassifier: ```scala - val xgbClassifier1 = new XGBoostClassifier(). + val xgbClassifier = new XGBoostClassifier(). setFeaturesCol("features"). 
setLabelCol("classIndex") - xgbClassifier1.setMaxDeltaStep(2) - ``` - -After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel, and apply -transformation to the DataFrame containing training set, i.e. xgbInput. - -### Parallel/Distributed Training - -One of the most important parameters we set for XGBoostClassifier is "num_workers" (or "numWorkers"). -This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. - -In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task. By default, we allocate a core per each XGBoost worker. -Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved - by running multiple workers (i.e. Spark tasks) at the same time. - - If you do want OpenMP optimization, you have to - - 1. set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor - - 2. set `spark.task.cpus` in Spark to the same value as `nthread` - -### Run XGBoost4J-Spark in Production - -XGBoost4J-Spark has attracted a lot of users from industry and is deployed in many production environments. We also include many features -enabling running XGBoost4J-Spark in production smoothly. - -#### Gang Scheduling - -XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) - to synchronize the stats of each worker. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores - should be available before the training runs. - -However, in production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark can get -all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more -cores to be available. 
This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. - Additionally, this usually happens silently and does not bring the attention of users. - - XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get - enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, - you can set with XGBoostClassifier: - - ```scala - xgbClassifier.setTimeoutRequestWorkers() - ``` - - or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier - - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "timeout_request_workers" -> 60000L) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") + xgbClassifier.setMaxDepth(2) ``` -#### Checkpoint During Training - -Transient Failures are commonly seen in production environment. To simplify the design of XGBoost, - we stop training if any of the distributed workers fail. Additionally, to efficiently recover failed training, we support - checkpoint mechanism to facilitate failure recovery. 
- - To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and - the path store checkpointPath with `setCheckpointPath`: - - ```scala - xgbClassifier.setCheckpointInterval(2) - xgbClassifier.setCheckpointPath("/checkpoint_path") - ``` - - an equivalent way is to pass in parameters in XGBoostClassifier's constructor: - - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "checkpoint_path" -> "/checkpoints", - "checkpoint_interval" -> 2) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") - ``` +After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. This `fit` operation is essentially the training process and the generated model can then be used in Prediction. -If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file -in `/checkpoints` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. +```scala + val xgbClassificationModel = xgbClassifier.fit(xgbInput) +``` ## Prediction -XGBoost4j-Spark supports two way for model serving: batch prediction and single instance prediction. +XGBoost4j-Spark supports two ways for model serving: batch prediction and single instance prediction. 
+ +### Batch Prediction -### Batch prediction +When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, read the column containing feature vectors, predict for each feature vector, and output a new DataFrame with the following columns by default: -When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, read the column containing feature vectors, -predict for each feature vector, and output a new DataFrame with the following columns by default: +* `XGBoostClassificationModel` will output margins (`rawPredictionCol`), probabilities(`probabilityCol`) and the eventual prediction labels (`predictionCol`) for each possible label. +* `XGBoostRegressionModel` will output prediction label(`predictionCol`). -* `XGBoostClassificationModel` will output raw predictions for each possible label(`rawPredictionCol`), - the probability of each possible label(`probabilityCol`), and the predicted label(`predictionCol`). -* `XGBoostRegressionModel` will output predicted label(`predictionCol`). +Batch Prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch. 
```scala val xgbClassificationModel = xgbClassifier.fit(xgbInput) - val results = xgbClassificationModel.transform(xgbInput) + val results = xgbClassificationModel.transform(testSet) ``` -Now, we get a DataFrame, result containing margin, probability for each class and the prediction for each instance +With the above code snippet, we get a result DataFrame, result containing margin, probability for each class and the prediction for each instance ```scala +-----------------+----------+--------------------+--------------------+----------+ @@ -304,8 +222,9 @@ Now, we get a DataFrame, result containing margin, probability for each class an ### Single instance prediction `XGBoostClassificationModel` or `XGBoostRegressionModel` support make prediction on single instance as well. -It accepts a single Vector as feature, and output the predicted double label. -However, this function's performance is not ideal, use it carefully! +It accepts a single Vector as feature, and output the prediction label. + +However, the overhead of single-instance prediction is high due to the internal overhead of XGBoost, use it carefully! ```scala val features = xgbInput.head().getAs[Vector]("features") @@ -316,11 +235,9 @@ However, this function's performance is not ideal, use it carefully! ### Model and pipeline persistence -A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. -So it's important to support model persistence. +A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. Reversely, a trained model may be used by data scientists, for example as a baseline, across the process of data exploration. So it's important to support model persistence to make the models available across usage scenarios and programming languages. 
-XGBoost4j-Spark supports save/load `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`, -it also support save/load a ML pipeline which includes these estimators and models. +XGBoost4j-Spark supports save/load `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`, it also support save/load a ML pipeline which includes these estimators and models. We can save the XGBoostClassificationModel to file system: @@ -340,10 +257,9 @@ and then loading the model in another session: With regards to ML pipeline save and load, please refer the next section. -### Export to local +### Export for Other Bindings of XGBoost -After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine -or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by: +After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by: ```scala val nativeModelPath = "/tmp/nativeModel" @@ -358,6 +274,18 @@ Then we can load this model with single node Python XGBoost: bst.load_model(nativeModelPath) ``` +NOTE: + +There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. + +When users use Spark to load trainingset/testset in LibSVM format with the following code snippet: + +```scala +spark.read.format("libsvm").load("trainingset_libsvm") +``` + +Spark assumes that the dataset is 1-based indexed. However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is 0-based indexed. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. 
The solution is to transform the dataset to 0-based before you predict with, for example, Python API. + # Building a ML Pipeline with XGBoost4J-Spark ## Basic ML Pipeline @@ -455,8 +383,94 @@ The model producing the maximum accuracy defined by `MulticlassClassificationEva bestModel.extractParamMap() ``` +# Run XGBoost4J-Spark in Production + + +## Parallel/Distributed Training + +One of the most important parameters we set for XGBoostClassifier is "num_workers" (or "numWorkers"). +This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. + +In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task. By default, we allocate a core per each XGBoost worker. +Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved + by running multiple workers (i.e. Spark tasks) at the same time. + + If you do want OpenMP optimization, you have to + + 1. set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor + + 2. set `spark.task.cpus` in Spark to the same value as `nthread` + + + +XGBoost4J-Spark has attracted a lot of users from industry and is deployed in many production environments. We also include many features +enabling running XGBoost4J-Spark in production smoothly. + +## Gang Scheduling + +XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) + to synchronize the stats of each worker. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores + should be available before the training runs. + +However, in production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark can get +all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more +cores to be available. 
This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. + Additionally, this usually happens silently and does not bring the attention of users. + + XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get + enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, + you can set with XGBoostClassifier: + + ```scala + xgbClassifier.setTimeoutRequestWorkers() + ``` + + or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "timeout_request_workers" -> 60000L) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + ``` +## Checkpoint During Training +Transient Failures are commonly seen in production environment. To simplify the design of XGBoost, + we stop training if any of the distributed workers fail. Additionally, to efficiently recover failed training, we support + checkpoint mechanism to facilitate failure recovery. + + To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and + the path store checkpointPath with `setCheckpointPath`: + + ```scala + xgbClassifier.setCheckpointInterval(2) + xgbClassifier.setCheckpointPath("/checkpoint_path") + ``` + + an equivalent way is to pass in parameters in XGBoostClassifier's constructor: + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "checkpoint_path" -> "/checkpoints", + "checkpoint_interval" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). 
+ setLabelCol("classIndex") + ``` +If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file +in `/checkpoints` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. From 5848888d26d337f8b32b6477542aa102736a9e6b Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Sat, 28 Jul 2018 16:47:56 -0700 Subject: [PATCH 13/25] finish the doc --- jvm-packages/xgboost4j-spark/docs/index.md | 86 +++++++++------------- 1 file changed, 33 insertions(+), 53 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 86994ea8c847..09ec26c457e7 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -309,35 +309,23 @@ The we build the ML `Pipeline` which includes 4 stages: * Use `XGBoostClassifier` to train classification model. * Convert indexed double label back to original string label. 
-And start to run this `Pipeline` and get a `PipelineModel`: +We have shown the first three steps in the earlier sections, and the last step is finished with a new Transformer IndexToString: ```scala - import org.apache.spark.ml.feature._ - import org.apache.spark.ml.Pipeline - - val assembler = new VectorAssembler() - .setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")) - .setOutputCol("features") - val labelIndexer = new StringIndexer() - .setInputCol("species") - .setOutputCol("label") - .fit(training) - val booster = new XGBoostClassifier( - Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2 - ) - ) - val labelConverter = new IndexToString() + val labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("realLabel") - .setLabels(labelIndexer.labels) + .setLabels(stringIndexer.labels) +``` + +We need to organize these steps as a `Pipeline` in Spark ML framework and evaluate the whole pipeline to get a `PipelineModel`: +```scala + import org.apache.spark.ml.feature._ + import org.apache.spark.ml.Pipeline + val pipeline = new Pipeline() - .setStages(Array(assembler, labelIndexer, booster, labelConverter)) + .setStages(Array(assembler, stringIndexer, booster, labelConverter)) val model = pipeline.fit(training) ``` @@ -353,9 +341,7 @@ After we get the PipelineModel, we can make prediction on the test dataset and e ## Pipeline with Hyper-parameter Tunning -The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. -Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, -we can utilize the Spark model selecting tool to automate this process. +The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. 
With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process. The following example shows the code snippet utilizing `CrossValidation` and `MulticlassClassificationEvaluator` to search the optimal combination of two XGBoost parameters, [`max_depth` and `eta`](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md). @@ -386,13 +372,18 @@ The model producing the maximum accuracy defined by `MulticlassClassificationEva # Run XGBoost4J-Spark in Production +XGBoost4J-Spark is one of the most important steps to bring XGBoost to production environment easier. In this section, we introduce three key features to run XGBoost4J-Spark in production. + ## Parallel/Distributed Training -One of the most important parameters we set for XGBoostClassifier is "num_workers" (or "numWorkers"). +The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. + +In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user. + +In the code snippet where we build XGBoostClassifier, we set parameter "num_workers" (or "numWorkers"). This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. -In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task. By default, we allocate a core per each XGBoost worker. -Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved + By default, we allocate a core per each XGBoost worker. 
Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved by running multiple workers (i.e. Spark tasks) at the same time. If you do want OpenMP optimization, you have to @@ -400,29 +391,18 @@ Therefore, the OpenMP optimization within each XGBoost worker does not take effe 1. set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor 2. set `spark.task.cpus` in Spark to the same value as `nthread` - - - -XGBoost4J-Spark has attracted a lot of users from industry and is deployed in many production environments. We also include many features -enabling running XGBoost4J-Spark in production smoothly. - + ## Gang Scheduling XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) - to synchronize the stats of each worker. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores - should be available before the training runs. +algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores should be available before the training runs. -However, in production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark can get -all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more -cores to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. - Additionally, this usually happens silently and does not bring the attention of users. +In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. 
By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and tries to claim more. Additionally, this usually happens silently and does not come to the attention of users. - XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get - enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, - you can set with XGBoostClassifier: +XGBoost4J-Spark allows the user to set up a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources by hanging for a long time. To enable this feature, you can set the timeout on XGBoostClassifier/XGBoostRegressor: ```scala - xgbClassifier.setTimeoutRequestWorkers() + xgbClassifier.setTimeoutRequestWorkers(60000L) ``` or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier @@ -439,15 +419,16 @@ cores to be available. This process usually brings unnecessary resource waste as setFeaturesCol("features"). setLabelCol("classIndex") ``` + +If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. Users can have an external mechanism to monitor the status of the application and get notified in such a case. ## Checkpoint During Training -Transient Failures are commonly seen in production environment. To simplify the design of XGBoost, - we stop training if any of the distributed workers fail. Additionally, to efficiently recover failed training, we support - checkpoint mechanism to facilitate failure recovery. +Transient failures are also commonly seen in production environments.
To simplify the design of XGBoost, + we stop training if any of the distributed workers fail. However, if the training fails after running for a long time, all of the resources already spent on it would be wasted. + - To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and - the path store checkpointPath with `setCheckpointPath`: +We support creating checkpoints during training to facilitate more efficient failure recovery. To enable this feature, you can set how many iterations pass between checkpoints with `setCheckpointInterval` and the path where checkpoints are stored with `setCheckpointPath`: ```scala xgbClassifier.setCheckpointInterval(2) @@ -463,14 +444,13 @@ Transient Failures are commonly seen in production environment. To simplify the "num_class" -> 3, "num_round" -> 100, "num_workers" -> 2, - "checkpoint_path" -> "/checkpoints", + "checkpoint_path" -> "/checkpoints_path", "checkpoint_interval" -> 2) val xgbClassifier = new XGBoostClassifier(xgbParam). setFeaturesCol("features"). setLabelCol("classIndex") ``` -If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file -in `/checkpoints` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. +If the training fails during these 100 rounds, the next run of training would start by reading the latest checkpoint file in `/checkpoints_path` and resume from the iteration at which the checkpoint was built, running until the next failure or until the specified 100 rounds complete.
From c68f8a1bf7e43d5c892a0b27a0e694e61b24c821 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Sat, 28 Jul 2018 21:11:27 -0700 Subject: [PATCH 14/25] adjust code --- .../example/spark/SparkMLlibPipeline.scala | 10 +- .../example/spark/SparkModelTuningTool.scala | 206 ------------------ .../scala/example/spark/SparkTraining.scala | 78 +++++++ .../example/spark/SparkWithDataFrame.scala | 55 ----- 4 files changed, 84 insertions(+), 265 deletions(-) delete mode 100644 jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala create mode 100644 jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala delete mode 100644 jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala index e9c575b3214b..ea54b457ac04 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala @@ -25,6 +25,8 @@ import org.apache.spark.sql.types._ import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel} +// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) + object SparkMLlibPipeline { def main(args: Array[String]): Unit = { @@ -49,7 +51,7 @@ object SparkMLlibPipeline { StructField("sepal width", DoubleType, true), StructField("petal length", DoubleType, true), StructField("petal width", DoubleType, true), - StructField("species", StringType, true))) + StructField("class", StringType, true))) val rawInput = spark.read.schema(schema).csv(inputPath) @@ -65,8 +67,8 @@ object SparkMLlibPipeline 
{ .setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")) .setOutputCol("features") val labelIndexer = new StringIndexer() - .setInputCol("species") - .setOutputCol("label") + .setInputCol("class") + .setOutputCol("classIndex") .fit(training) val booster = new XGBoostClassifier( Map("eta" -> 0.1f, @@ -126,4 +128,4 @@ object SparkMLlibPipeline { val model2 = PipelineModel.load(pipelineModelPath) model2.transform(test).show(false) } -} \ No newline at end of file +} diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala deleted file mode 100644 index 0c4a7ce14b7b..000000000000 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkModelTuningTool.scala +++ /dev/null @@ -1,206 +0,0 @@ -/* - Copyright (c) 2014 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- */ - -package ml.dmlc.xgboost4j.scala.example.spark - - -import scala.collection.mutable -import scala.collection.mutable.ListBuffer -import scala.io.Source - -import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor -import org.apache.spark.ml.Pipeline -import org.apache.spark.ml.evaluation.RegressionEvaluator -import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer} -import org.apache.spark.ml.tuning._ -import org.apache.spark.sql.{Dataset, DataFrame, SparkSession} - -case class SalesRecord(storeId: Int, daysOfWeek: Int, date: String, sales: Int, customers: Int, - open: Int, promo: Int, stateHoliday: String, schoolHoliday: String) - -case class Store(storeId: Int, storeType: String, assortment: String, competitionDistance: Int, - competitionOpenSinceMonth: Int, competitionOpenSinceYear: Int, promo2: Int, - promo2SinceWeek: Int, promo2SinceYear: Int, promoInterval: String) - -object SparkModelTuningTool { - - private def parseStoreFile(storeFilePath: String): List[Store] = { - var isHeader = true - val storeInstances = new ListBuffer[Store] - for (line <- Source.fromFile(storeFilePath).getLines()) { - if (isHeader) { - isHeader = false - } else { - try { - val strArray = line.split(",") - if (strArray.length == 10) { - val Array(storeIdStr, storeTypeStr, assortmentStr, competitionDistanceStr, - competitionOpenSinceMonthStr, competitionOpenSinceYearStr, promo2Str, - promo2SinceWeekStr, promo2SinceYearStr, promoIntervalStr) = line.split(",") - storeInstances += Store(storeIdStr.toInt, storeTypeStr, assortmentStr, - if (competitionDistanceStr == "") -1 else competitionDistanceStr.toInt, - if (competitionOpenSinceMonthStr == "" ) -1 else competitionOpenSinceMonthStr.toInt, - if (competitionOpenSinceYearStr == "" ) -1 else competitionOpenSinceYearStr.toInt, - promo2Str.toInt, - if (promo2Str == "0") -1 else promo2SinceWeekStr.toInt, - if (promo2Str == "0") -1 else promo2SinceYearStr.toInt, - promoIntervalStr.replace("\"", "")) - } else { - val 
Array(storeIdStr, storeTypeStr, assortmentStr, competitionDistanceStr, - competitionOpenSinceMonthStr, competitionOpenSinceYearStr, promo2Str, - promo2SinceWeekStr, promo2SinceYearStr, firstMonth, secondMonth, thirdMonth, - forthMonth) = line.split(",") - storeInstances += Store(storeIdStr.toInt, storeTypeStr, assortmentStr, - if (competitionDistanceStr == "") -1 else competitionDistanceStr.toInt, - if (competitionOpenSinceMonthStr == "" ) -1 else competitionOpenSinceMonthStr.toInt, - if (competitionOpenSinceYearStr == "" ) -1 else competitionOpenSinceYearStr.toInt, - promo2Str.toInt, - if (promo2Str == "0") -1 else promo2SinceWeekStr.toInt, - if (promo2Str == "0") -1 else promo2SinceYearStr.toInt, - firstMonth.replace("\"", "") + "," + secondMonth + "," + thirdMonth + "," + - forthMonth.replace("\"", "")) - } - } catch { - case e: Exception => - e.printStackTrace() - sys.exit(1) - } - } - } - storeInstances.toList - } - - private def parseTrainingFile(trainingPath: String): List[SalesRecord] = { - var isHeader = true - val records = new ListBuffer[SalesRecord] - for (line <- Source.fromFile(trainingPath).getLines()) { - if (isHeader) { - isHeader = false - } else { - val Array(storeIdStr, daysOfWeekStr, dateStr, salesStr, customerStr, openStr, promoStr, - stateHolidayStr, schoolHolidayStr) = line.split(",") - val salesRecord = SalesRecord(storeIdStr.toInt, daysOfWeekStr.toInt, dateStr, - salesStr.toInt, customerStr.toInt, openStr.toInt, promoStr.toInt, stateHolidayStr, - schoolHolidayStr) - records += salesRecord - } - } - records.toList - } - - private def featureEngineering(ds: DataFrame): DataFrame = { - import org.apache.spark.sql.functions._ - import ds.sparkSession.implicits._ - val stateHolidayIndexer = new StringIndexer() - .setInputCol("stateHoliday") - .setOutputCol("stateHolidayIndex") - val schoolHolidayIndexer = new StringIndexer() - .setInputCol("schoolHoliday") - .setOutputCol("schoolHolidayIndex") - val storeTypeIndexer = new StringIndexer() - 
.setInputCol("storeType") - .setOutputCol("storeTypeIndex") - val assortmentIndexer = new StringIndexer() - .setInputCol("assortment") - .setOutputCol("assortmentIndex") - val promoInterval = new StringIndexer() - .setInputCol("promoInterval") - .setOutputCol("promoIntervalIndex") - val filteredDS = ds.filter($"sales" > 0).filter($"open" > 0) - // parse date - val dsWithDayCol = - filteredDS.withColumn("day", udf((dateStr: String) => - dateStr.split("-")(2).toInt).apply(col("date"))) - val dsWithMonthCol = - dsWithDayCol.withColumn("month", udf((dateStr: String) => - dateStr.split("-")(1).toInt).apply(col("date"))) - val dsWithYearCol = - dsWithMonthCol.withColumn("year", udf((dateStr: String) => - dateStr.split("-")(0).toInt).apply(col("date"))) - val dsWithLogSales = dsWithYearCol.withColumn("logSales", - udf((sales: Int) => math.log(sales)).apply(col("sales"))) - - // fill with mean values - val meanCompetitionDistance = dsWithLogSales.select(avg("competitionDistance")).first()(0). - asInstanceOf[Double] - println("====" + meanCompetitionDistance) - val finalDS = dsWithLogSales.withColumn("transformedCompetitionDistance", - udf((distance: Int) => if (distance > 0) distance.toDouble else meanCompetitionDistance). - apply(col("competitionDistance"))) - - val vectorAssembler = new VectorAssembler() - .setInputCols(Array("storeId", "daysOfWeek", "promo", "competitionDistance", "promo2", "day", - "month", "year", "transformedCompetitionDistance", "stateHolidayIndex", - "schoolHolidayIndex", "storeTypeIndex", "assortmentIndex", "promoIntervalIndex")) - .setOutputCol("features") - - val pipeline = new Pipeline().setStages( - Array(stateHolidayIndexer, schoolHolidayIndexer, storeTypeIndexer, assortmentIndexer, - promoInterval, vectorAssembler)) - - pipeline.fit(finalDS).transform(finalDS). 
- drop("stateHoliday", "schoolHoliday", "storeType", "assortment", "promoInterval", "sales", - "promo2SinceWeek", "customers", "promoInterval", "competitionOpenSinceYear", - "competitionOpenSinceMonth", "promo2SinceYear", "competitionDistance", "date") - } - - private def crossValidation( - xgboostParam: Map[String, Any], - trainingData: Dataset[_]): TrainValidationSplitModel = { - val xgbEstimator = new XGBoostRegressor(xgboostParam).setFeaturesCol("features"). - setLabelCol("logSales") - val paramGrid = new ParamGridBuilder() - .addGrid(xgbEstimator.numRound, Array(20, 50)) - .addGrid(xgbEstimator.eta, Array(0.1, 0.4)) - .build() - val tv = new TrainValidationSplit() - .setEstimator(xgbEstimator) - .setEvaluator(new RegressionEvaluator().setLabelCol("logSales")) - .setEstimatorParamMaps(paramGrid) - .setTrainRatio(0.8) // Use 3+ in practice - tv.fit(trainingData) - } - - def main(args: Array[String]): Unit = { - val sparkSession = SparkSession.builder().appName("rosseman").getOrCreate() - import sparkSession.implicits._ - - // parse training file to data frame - val trainingPath = args(0) - val allSalesRecords = parseTrainingFile(trainingPath) - // create dataset - val salesRecordsDF = allSalesRecords.toDF - - // parse store file to data frame - val storeFilePath = args(1) - val allStores = parseStoreFile(storeFilePath) - val storesDS = allStores.toDF() - - val fullDataset = salesRecordsDF.join(storesDS, "storeId") - val featureEngineeredDF = featureEngineering(fullDataset) - // prediction - val params = new mutable.HashMap[String, Any]() - params += "eta" -> 0.1 - params += "max_depth" -> 6 - params += "silent" -> 1 - params += "ntreelimit" -> 1000 - params += "objective" -> "reg:linear" - params += "subsample" -> 0.8 - params += "num_round" -> 100 - - val bestModel = crossValidation(params.toMap, featureEngineeredDF) - } -} diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala 
b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala new file mode 100644 index 000000000000..13d8ecb1a25d --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala @@ -0,0 +1,78 @@ +/* + Copyright (c) 2014 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.scala.example.spark + +import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier + +import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + +// this example works with Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris) +object SparkTraining { + + def main(args: Array[String]): Unit = { + if (args.length < 1) { + // scalastyle:off + println("Usage: program input_path") + sys.exit(1) + } + + val spark = SparkSession.builder().getOrCreate() + val inputPath = args(0) + val schema = new StructType(Array( + StructField("sepal length", DoubleType, true), + StructField("sepal width", DoubleType, true), + StructField("petal length", DoubleType, true), + StructField("petal width", DoubleType, true), + StructField("class", StringType, true))) + val rawInput = spark.read.schema(schema).csv(args(0)) + + // transform class to index to make xgboost happy + val stringIndexer = new StringIndexer() + .setInputCol("class") + 
.setOutputCol("classIndex") + .fit(rawInput) + val labelTransformed = stringIndexer.transform(rawInput).drop("class") + // compose all feature columns as vector + val vectorAssembler = new VectorAssembler(). + setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). + setOutputCol("features") + val xgbInput = vectorAssembler.transform(labelTransformed).select("features", + "classIndex") + + /** + * set "timeout_request_workers" -> 60000L to make this application fail if it cannot get + * enough resources for 2 workers within 60000 ms + * + * set "checkpoint_path" -> "/checkpoints" and "checkpoint_interval" -> 2 to save a checkpoint every + * two iterations + */ + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + val results = xgbClassificationModel.transform(xgbInput) + results.show() + } +} diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala deleted file mode 100644 index 788850459db7..000000000000 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkWithDataFrame.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - Copyright (c) 2014 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License.
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package ml.dmlc.xgboost4j.scala.example.spark - -import ml.dmlc.xgboost4j.scala.Booster -import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier -import org.apache.spark.sql.SparkSession -import org.apache.spark.SparkConf - -object SparkWithDataFrame { - def main(args: Array[String]): Unit = { - if (args.length != 4) { - println( - "usage: program num_of_rounds num_workers training_path test_path") - sys.exit(1) - } - // create SparkSession - val sparkConf = new SparkConf().setAppName("XGBoost-spark-example") - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - sparkConf.registerKryoClasses(Array(classOf[Booster])) - // val sqlContext = new SQLContext(new SparkContext(sparkConf)) - val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate() - // create training and testing dataframes - val numRound = args(0).toInt - val inputTrainPath = args(2) - val inputTestPath = args(3) - // build dataset - val trainDF = sparkSession.sqlContext.read.format("libsvm").load(inputTrainPath) - val testDF = sparkSession.sqlContext.read.format("libsvm").load(inputTestPath) - // start training - val paramMap = List( - "eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "binary:logistic", - "num_round" -> numRound, - "num_workers" -> args(1).toInt).toMap - val xgboostModel = new XGBoostClassifier(paramMap).fit(trainDF) - // xgboost-spark appends the column containing prediction results - xgboostModel.transform(testDF).show() - } -} From 0a686d76fdb7e86c4b8cd9f06ce057f077de8a3d Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Mon, 30 Jul 2018 
14:42:02 -0700 Subject: [PATCH 15/25] fix the doc --- jvm-packages/xgboost4j-spark/docs/index.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/jvm-packages/xgboost4j-spark/docs/index.md index 09ec26c457e7..4e0c3f9458ca 100644 --- a/jvm-packages/xgboost4j-spark/docs/index.md +++ b/jvm-packages/xgboost4j-spark/docs/index.md @@ -36,8 +36,8 @@ We also publish some functionalities which would be included in the coming relea ```xml - GitHub Repo - GitHub Repo + XGBoost4J-Spark Snapshot Repo + XGBoost4J-Spark Snapshot Repo https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ ``` @@ -92,11 +92,11 @@ Spark also contains many built-in readers for other format. The latest version o To make Iris dataset be recognizable to XGBoost, we need to -1. Transform String-typed label, i.e. "class", to Integer-typed label. +1. Transform String-typed label, i.e. "class", to Double-typed label. -2. Assemble the feature columns as a vector to build XGBoost's internal data representation, DMatrix. +2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework. -To convert String-typed label to Integer, we can use Spark's built-in feature transformer StringIndexer. +To convert String-typed label to Double, we can use Spark's built-in feature transformer StringIndexer. ```scala import org.apache.spark.ml.feature.StringIndexer @@ -110,11 +110,11 @@ To convert String-typed label to Integer, we can use Spark's built-in feature tr With a newly created StringIndexer instance: 1. we set input column, i.e. the column containing String-typed label -2. we set output column, i.e. the column to contain the Integer-typed label. +2. we set output column, i.e. the column to contain the Double-typed label. 3. Then we `fit` StringIndex with our input DataFrame, 'rawInput', so that Spark internals can get information like total number of distinct values, etc. 
Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we `transform` the input DataFrame, 'rawInput' and to keep a concise DataFrame, -we drop the column `class` and only keeps the feature columns and the transformed Integer-typed label column (in the last line of the above code snippet). +we drop the column `class` and only keeps the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). `fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each transformer applies `transform` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about `fit` and `transform`, You can find more details in [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). @@ -130,7 +130,7 @@ Similarly, we can use another transformer, 'VectorAssembler', to assemble featur ``` Now, we have a DataFrame containing only two columns, "features" which contains vector-represented -"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Integer-typed +"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. @@ -162,7 +162,7 @@ equivalent form in XGBoost4J-Spark with camel case. For example, to set max_dept val xgbClassifier = new XGBoostClassifier(). setFeaturesCol("features"). setLabelCol("classIndex") - xgbClassifier.setMaxDeltaStep(2) + xgbClassifier.setMaxDepth(2) ``` After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. 
This `fit` operation is essentially the training process and the generated model can then be used in Prediction. From 2276a7e530adb45029413c41f8b0440e8ee893bb Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Mon, 30 Jul 2018 20:51:51 -0700 Subject: [PATCH 16/25] use rst --- doc/jvm/index.rst | 1 + .../docs/index.md => doc/jvm/xgboost4j_spark_tutorial.rst | 0 2 files changed, 1 insertion(+) rename jvm-packages/xgboost4j-spark/docs/index.md => doc/jvm/xgboost4j_spark_tutorial.rst (100%) diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst index 9a7cbaaa34fa..18e5e562a179 100644 --- a/doc/jvm/index.rst +++ b/doc/jvm/index.rst @@ -138,6 +138,7 @@ Contents .. toctree:: Java Overview Tutorial + XGBoost4J-Spark Tutorial Code Examples XGBoost4J Java API XGBoost4J Scala API diff --git a/jvm-packages/xgboost4j-spark/docs/index.md b/doc/jvm/xgboost4j_spark_tutorial.rst similarity index 100% rename from jvm-packages/xgboost4j-spark/docs/index.md rename to doc/jvm/xgboost4j_spark_tutorial.rst From 9bce7e7c21e03c0030b7ccf692d2a1ee3fc9a69d Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Tue, 31 Jul 2018 20:29:35 -0700 Subject: [PATCH 17/25] Convert XGBoost4J-Spark tutorial to reST --- doc/jvm/xgboost4j_spark_tutorial.rst | 621 ++++++++++++++------------- doc/tutorials/index.rst | 3 +- 2 files changed, 321 insertions(+), 303 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 4e0c3f9458ca..f185fb2c4593 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -1,456 +1,473 @@ -# XGBoost4J-Spark Tutorial (version >= 0.8) +####################################### +XGBoost4J-Spark Tutorial (version 0.8+) +####################################### -XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. 
With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: +**XGBoost4J-Spark** is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: - * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. - * Pipelines: constructing, evaluating, and tuning ML Pipelines - * Persistence: persist and load machine learning models and even whole Pipelines +* Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. +* Pipelines: constructing, evaluating, and tuning ML Pipelines +* Persistence: persist and load machine learning models and even whole Pipelines This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss - * Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface - * Training a XGBoost model with XGBoost4J-Spark - * Serving XGBoost model (prediction) with Spark - * Building a Machine Learning Pipeline with XGBoost4J-Spark - * Running XGBoost4J-Spark in Production - -# Build an ML Application with XGBoost4J-Spark +* Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface +* Training a XGBoost model with XGBoost4J-Spark +* Serving XGBoost model (prediction) with Spark +* Building a Machine Learning Pipeline with XGBoost4J-Spark +* Running XGBoost4J-Spark in Production + +.. 
contents:: + :backlinks: none + :local: -## Refer to XGBoost4J-Spark Dependency +******************************************** +Build an ML Application with XGBoost4J-Spark +******************************************** + +Refer to XGBoost4J-Spark Dependency +=================================== Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central. -You can add the following dependency in your pom file. +You can add the following dependency in your ``pom.xml``. + +.. code-block:: xml -```xml - + ml.dmlc xgboost4j-spark latest_version_num - -``` + -For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). +For the latest release version number, please check `here `_. -We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in pom.xml: +We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in ``pom.xml``: -```xml - - XGBoost4J-Spark Snapshot Repo - XGBoost4J-Spark Snapshot Repo - https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ - -``` +.. code-block:: xml + + + XGBoost4J-Spark Snapshot Repo + XGBoost4J-Spark Snapshot Repo + https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ + and then refer to the snapshot dependency by adding: -```xml - - ml.dmlc - xgboost4j - next_version_num-SNAPSHOT - -``` +.. 
code-block:: xml + ml.dmlc + xgboost4j + next_version_num-SNAPSHOT + -## Data Preparation +Data Preparation +================ As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables - users to apply various types of transformation over the training/test datasets with the convenient - and powerful data processing framework, Spark. +users to apply various types of transformation over the training/test datasets with the convenient +and powerful data processing framework, Spark. -In this section, we use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset as an example to - showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost. +In this section, we use `Iris `_ dataset as an example to +showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost. Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", -"petal length" and "petal width". "class" column in each instance is essentially the label which has three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". +"petal length" and "petal width". In addition, it contains the "class" column, which is essentially the label with three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". -### Read Dataset with Spark's Built-In Reader +Read Dataset with Spark's Built-In Reader +----------------------------------------- -The first thing in data transformation is to load the dataset as Spark's structured data abstraction, -DataFrame. +The first thing in data transformation is to load the dataset as Spark's structured data abstraction, DataFrame. 
-```scala - import org.apache.spark.sql.SparkSession - import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} - - val spark = SparkSession.builder().getOrCreate() - val schema = new StructType(Array( - StructField("sepal length", DoubleType, true), - StructField("sepal width", DoubleType, true), - StructField("petal length", DoubleType, true), - StructField("petal width", DoubleType, true), - StructField("class", StringType, true))) - val rawInput = spark.read.schema(schema).csv("input_path") -``` - -At the first line, we create a instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) which is the entry of any Spark program working with DataFrame. The `schema` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types, otherwise the column name would be the default ones derived by Spark, such as `_col0`, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named `rawInput`. - -Spark also contains many built-in readers for other format. The latest version of Spark supports, csv/json/parquet/libsvm. +.. code-block:: scala + + import org.apache.spark.sql.SparkSession + import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + + val spark = SparkSession.builder().getOrCreate() + val schema = new StructType(Array( + StructField("sepal length", DoubleType, true), + StructField("sepal width", DoubleType, true), + StructField("petal length", DoubleType, true), + StructField("petal width", DoubleType, true), + StructField("class", StringType, true))) + val rawInput = spark.read.schema(schema).csv("input_path") + +At the first line, we create a instance of `SparkSession `_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. 
With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``. + +Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM. -### Transform Raw Iris Dataset +Transform Raw Iris Dataset +-------------------------- To make Iris dataset be recognizable to XGBoost, we need to 1. Transform String-typed label, i.e. "class", to Double-typed label. - 2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework. -To convert String-typed label to Double, we can use Spark's built-in feature transformer StringIndexer. +To convert String-typed label to Double, we can use Spark's built-in feature transformer `StringIndexer `_. -```scala - import org.apache.spark.ml.feature.StringIndexer - val stringIndexer = new StringIndexer(). - setInputCol("class"). - setOutputCol("classIndex"). - fit(rawInput) - val labelTransformed = stringIndexer.transform(rawInput).drop("class") -``` +.. code-block:: scala + + import org.apache.spark.ml.feature.StringIndexer + val stringIndexer = new StringIndexer(). + setInputCol("class"). + setOutputCol("classIndex"). + fit(rawInput) + val labelTransformed = stringIndexer.transform(rawInput).drop("class") With a newly created StringIndexer instance: 1. we set input column, i.e. the column containing String-typed label 2. we set output column, i.e. the column to contain the Double-typed label. -3. Then we `fit` StringIndex with our input DataFrame, 'rawInput', so that Spark internals can get information like total number of distinct values, etc. +3. Then we ``fit`` StringIndex with our input DataFrame ``rawInput``, so that Spark internals can get information like total number of distinct values, etc. 
-Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we `transform` the input DataFrame, 'rawInput' and to keep a concise DataFrame, -we drop the column `class` and only keeps the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). +Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we ``transform`` the input DataFrame ``rawInput`` and to keep a concise DataFrame, +we drop the column "class" and only keeps the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). -`fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each transformer applies `transform` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about `fit` and `transform`, You can find more details in [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). +The ``fit`` and ``transform`` are two key operations in MLLIB. Basically, ``fit`` produces a "transformer", e.g. StringIndexer, and each transformer applies ``transform`` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about ``fit`` and ``transform``, You can find more details in `here `_. -Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. +Similarly, we can use another transformer, `VectorAssembler `_, to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. + +.. 
code-block:: scala -```scala - import org.apache.spark.ml.feature.VectorAssembler - val vectorAssembler = new VectorAssembler(). - setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). - setOutputCol("features") - val xgbInput = vectorAssembler.transform(labelTransformed).select("features", - "classIndex") -``` + import org.apache.spark.ml.feature.VectorAssembler + val vectorAssembler = new VectorAssembler(). + setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). + setOutputCol("features") + val xgbInput = vectorAssembler.transform(labelTransformed).select("features", "classIndex") Now, we have a DataFrame containing only two columns, "features" which contains vector-represented "sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. - -## Training +Training +======== -XGBoost support both Regression and Classification. While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-classes Classification problem, The usage in Regression is very similar to Classification. +XGBoost supports both regression and classification. While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-classes classification problem, the usage in Regression is very similar to classification. To train a XGBoost model for classification, we need to claim a XGBoostClassifier first: -```scala - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). 
- setLabelCol("classIndex") -``` - -The available parameters for training a XGBoost model can be found in [here](https://xgboost.readthedocs.io/en/latest/parameter.html). In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case-variance of these parameters to keep consistent with Spark's MLLIB parameters. - -Specifically, each parameter in [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its -equivalent form in XGBoost4J-Spark with camel case. For example, to set max_depth for each tree, you can pass parameter just like what we did in the above code snippet (as `max_depth` wrapped in a Map), or you can do it through setters in XGBoostClassifer: +.. code-block:: scala + + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + +The available parameters for training a XGBoost model can be found in :doc:`here `. In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variant of these parameters to keep consistent with Spark's MLLIB parameters. + +Specifically, each parameter in :doc:`this page ` has its +equivalent form in XGBoost4J-Spark with camel case. For example, to set ``max_depth`` for each tree, you can pass parameter just like what we did in the above code snippet (as ``max_depth`` wrapped in a Map), or you can do it through setters in XGBoostClassifer: - ```scala - val xgbClassifier = new XGBoostClassifier(). - setFeaturesCol("features"). - setLabelCol("classIndex") - xgbClassifier.setMaxDepth(2) - ``` +.. code-block:: scala + + val xgbClassifier = new XGBoostClassifier(). + setFeaturesCol("features"). 
+ setLabelCol("classIndex") + xgbClassifier.setMaxDepth(2) -After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. This `fit` operation is essentially the training process and the generated model can then be used in Prediction. +After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. This ``fit`` operation is essentially the training process and the generated model can then be used in prediction. -```scala - val xgbClassificationModel = xgbClassifier.fit(xgbInput) -``` +.. code-block:: scala -## Prediction + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + +Prediction +========== XGBoost4j-Spark supports two ways for model serving: batch prediction and single instance prediction. -### Batch Prediction +Batch Prediction +---------------- + +When we get a model, either XGBoostClassificationModel or XGBoostRegressionModel, it takes a DataFrame, read the column containing feature vectors, predict for each feature vector, and output a new DataFrame with the following columns by default: -When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, read the column containing feature vectors, predict for each feature vector, and output a new DataFrame with the following columns by default: +* XGBoostClassificationModel will output margins (``rawPredictionCol``), probabilities(``probabilityCol``) and the eventual prediction labels (``predictionCol``) for each possible label. +* XGBoostRegressionModel will output prediction label(``predictionCol``). -* `XGBoostClassificationModel` will output margins (`rawPredictionCol`), probabilities(`probabilityCol`) and the eventual prediction labels (`predictionCol`) for each possible label. 
-* `XGBoostRegressionModel` will output prediction label(`predictionCol`). +Batch prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch. -Batch Prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch. +.. code-block:: scala -```scala - val xgbClassificationModel = xgbClassifier.fit(xgbInput) - val results = xgbClassificationModel.transform(testSet) -``` + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + val results = xgbClassificationModel.transform(testSet) With the above code snippet, we get a result DataFrame, result containing margin, probability for each class and the prediction for each instance -```scala -+-----------------+----------+--------------------+--------------------+----------+ -| features|classIndex| rawPrediction| probability|prediction| -+-----------------+----------+--------------------+--------------------+----------+ -|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.4,3.7,1.5,0.2]| 
0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -+-----------------+----------+--------------------+--------------------+----------+ - -``` - -### Single instance prediction - -`XGBoostClassificationModel` or `XGBoostRegressionModel` support make prediction on single instance as well. +.. code-block:: none + + +-----------------+----------+--------------------+--------------------+----------+ + | features|classIndex| rawPrediction| probability|prediction| + +-----------------+----------+--------------------+--------------------+----------+ + |[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| + |[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| + |[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| + |[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| + |[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| + |[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| + |[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| + |[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| + |[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| + |[4.9,3.1,1.5,0.1]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| + |[5.4,3.7,1.5,0.2]| 
0.0|[3.45569849014282...|[0.99428516626358...| 0.0| + |[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| + |[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| + |[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| + |[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| + |[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| + |[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| + |[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| + |[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| + |[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| + +-----------------+----------+--------------------+--------------------+----------+ + +Single instance prediction +-------------------------- + +XGBoostClassificationModel or XGBoostRegressionModel support make prediction on single instance as well. It accepts a single Vector as feature, and output the prediction label. However, the overhead of single-instance prediction is high due to the internal overhead of XGBoost, use it carefully! -```scala - val features = xgbInput.head().getAs[Vector]("features") - val result = xgbClassificationModel.predict(features) -``` +.. code-block:: scala + + val features = xgbInput.head().getAs[Vector]("features") + val result = xgbClassificationModel.predict(features) -## Model Persistence +Model Persistence +================= -### Model and pipeline persistence +Model and pipeline persistence +------------------------------ A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. Reversely, a trained model may be used by data scientists, for example as a baseline, across the process of data exploration. So it's important to support model persistence to make the models available across usage scenarios and programming languages. 
-XGBoost4j-Spark supports save/load `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`, it also support save/load a ML pipeline which includes these estimators and models. +XGBoost4j-Spark supports saving and loading XGBoostClassifier/XGBoostClassificationModel and XGBoostRegressor/XGBoostRegressionModel. It also supports saving and loading a ML pipeline which includes these estimators and models. We can save the XGBoostClassificationModel to file system: -```scala - val xgbClassificationModelPath = "/tmp/xgbClassificationModel" - xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath) -``` +.. code-block:: scala + + val xgbClassificationModelPath = "/tmp/xgbClassificationModel" + xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath) and then loading the model in another session: -```scala - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel +.. code-block:: scala + + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) - xgbClassificationModel2.transform(xgbInput) -``` + val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) + xgbClassificationModel2.transform(xgbInput) With regards to ML pipeline save and load, please refer the next section. -### Export for Other Bindings of XGBoost - +Export for Other Bindings of XGBoost +------------------------------------ After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by: -```scala - val nativeModelPath = "/tmp/nativeModel" - xgbClassificationModel.nativeBooster.saveModel(nativeModelPath) -``` +.. 
code-block:: scala -Then we can load this model with single node Python XGBoost: + val nativeModelPath = "/tmp/nativeModel" + xgbClassificationModel.nativeBooster.saveModel(nativeModelPath) -```python - import xgboost as xgb - bst = xgb.Booster({'nthread': 4}) - bst.load_model(nativeModelPath) -``` - -NOTE: +Then we can load this model with single node Python XGBoost: -There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. +.. code-block:: python -When users use Spark to load trainingset/testset in LibSVM format with the following code snippet: + import xgboost as xgb + bst = xgb.Booster({'nthread': 4}) + bst.load_model(nativeModelPath) -```scala -spark.read.format("libsvm").load("trainingset_libsvm") -``` +.. note:: Consistency issue between XGBoost4J-Spark and other bindings -Spark assumes that the dataset is 1-based indexed. However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is 0-based indexed. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based before you predict with, for example, Python API. + There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost. + + When users use Spark to load training/test data in LIBSVM format with the following code snippet: + + .. code-block:: scala + + spark.read.format("libsvm").load("trainingset_libsvm") + + Spark assumes that the dataset is using 1-based indexing (feature indices starting with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0). It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. 
The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API. -# Building a ML Pipeline with XGBoost4J-Spark +******************************************* +Building a ML Pipeline with XGBoost4J-Spark +******************************************* -## Basic ML Pipeline +Basic ML Pipeline +================= Spark ML pipeline can combine multiple algorithms or functions into a single pipeline. -It covers from feature extraction/transformation/selection to model training/prediction. +It covers from feature extraction, transformation, selection to model training and prediction. XGBoost4j-Spark makes it feasible to embed XGBoost into such a pipeline seamlessly. The following example shows how to build such a pipeline consisting of Spark MLlib feature transformer and XGBoostClassifier estimator. -We still use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset and the ```rawInput``` DataFrame. +We still use `Iris `_ dataset and the ``rawInput`` DataFrame. First we need to split the dataset into training and test dataset. -```scala - val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) -``` +.. code-block:: scala + + val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) + +Then we build the ML pipeline which includes 4 stages: -The we build the ML `Pipeline` which includes 4 stages: * Assemble all features into a single vector column. * From string label to indexed double label. -* Use `XGBoostClassifier` to train classification model. +* Use XGBoostClassifier to train classification model. * Convert indexed double label back to original string label. -We have shown the first three steps in the earlier sections, and the last step is finished with a new Transformer IndexToString: +We have shown the first three steps in the earlier sections, and the last step is finished with a new transformer `IndexToString `_: + +.. 
code-block:: scala -```scala val labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("realLabel") .setLabels(stringIndexer.labels) -``` -We need to organize these steps as a `Pipeline` in Spark ML framework and evaluate the whole pipeline to get a `PipelineModel`: +We need to organize these steps as a Pipeline in Spark ML framework and evaluate the whole pipeline to get a PipelineModel: -```scala - import org.apache.spark.ml.feature._ - import org.apache.spark.ml.Pipeline - - val pipeline = new Pipeline() - .setStages(Array(assembler, stringIndexer, booster, labelConverter)) - val model = pipeline.fit(training) -``` +.. code-block:: scala + + import org.apache.spark.ml.feature._ + import org.apache.spark.ml.Pipeline + + val pipeline = new Pipeline() + .setStages(Array(assembler, stringIndexer, booster, labelConverter)) + val model = pipeline.fit(training) After we get the PipelineModel, we can make prediction on the test dataset and evaluate the model accuracy. -```scala - import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator - - val prediction = model.transform(test) - val evaluator = new MulticlassClassificationEvaluator() - val accuracy = evaluator.evaluate(prediction) -``` +.. code-block:: scala -## Pipeline with Hyper-parameter Tunning + import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator + + val prediction = model.transform(test) + val evaluator = new MulticlassClassificationEvaluator() + val accuracy = evaluator.evaluate(prediction) +Pipeline with Hyper-parameter Tuning +==================================== The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process. 
-The following example shows the code snippet utilizing `CrossValidation` and `MulticlassClassificationEvaluator` -to search the optimal combination of two XGBoost parameters, [`max_depth` and `eta`](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md). -The model producing the maximum accuracy defined by `MulticlassClassificationEvaluator` is selected and used to generate the prediction for the test set. - -```scala - import org.apache.spark.ml.tuning._ - import org.apache.spark.ml.PipelineModel - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - - val paramGrid = new ParamGridBuilder() - .addGrid(booster.maxDepth, Array(3, 8)) - .addGrid(booster.eta, Array(0.2, 0.6)) - .build() - val cv = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(evaluator) - .setEstimatorParamMaps(paramGrid) - .setNumFolds(3) - - val cvModel = cv.fit(training) +The following example shows the code snippet utilizing CrossValidation and MulticlassClassificationEvaluator +to search the optimal combination of two XGBoost parameters, ``max_depth`` and ``eta``. (See :doc:`/parameter`.) +The model producing the maximum accuracy defined by MulticlassClassificationEvaluator is selected and used to generate the prediction for the test set. - val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) - .asInstanceOf[XGBoostClassificationModel] - bestModel.extractParamMap() -``` - -# Run XGBoost4J-Spark in Production +.. 
code-block:: scala + import org.apache.spark.ml.tuning._ + import org.apache.spark.ml.PipelineModel + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel + + val paramGrid = new ParamGridBuilder() + .addGrid(booster.maxDepth, Array(3, 8)) + .addGrid(booster.eta, Array(0.2, 0.6)) + .build() + val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(3) + + val cvModel = cv.fit(training) + + val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) + .asInstanceOf[XGBoostClassificationModel] + bestModel.extractParamMap() + +********************************* +Run XGBoost4J-Spark in Production +********************************* XGBoost4J-Spark is one of the most important steps to bring XGBoost to production environment easier. In this section, we introduce three key features to run XGBoost4J-Spark in production. -## Parallel/Distributed Training - +Parallel/Distributed Training +============================= The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user. -In the code snippet where we build XGBoostClassifier, we set parameter "num_workers" (or "numWorkers"). +In the code snippet where we build XGBoostClassifier, we set parameter ``num_workers`` (or ``numWorkers``). This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. - By default, we allocate a core per each XGBoost worker. 
Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved - by running multiple workers (i.e. Spark tasks) at the same time. - - If you do want OpenMP optimization, you have to - - 1. set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor - - 2. set `spark.task.cpus` in Spark to the same value as `nthread` - -## Gang Scheduling +.. note:: Regarding OpenMP optimization -XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) -algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores should be available before the training runs. + By default, we allocate a core per each XGBoost worker. Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved + by running multiple workers (i.e. Spark tasks) at the same time. + + If you do want OpenMP optimization, you have to + + 1. set ``nthread`` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor + 2. set ``spark.task.cpus`` in Spark to the same value as ``nthread`` + +Gang Scheduling +=============== +XGBoost uses `AllReduce <http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/>`_ +algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of ``nthread * numWorkers`` cores should be available before the training runs. In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more.
Additionally, this usually happens silently and does not bring the attention of users. XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, you can set with XGBoostClassifier/XGBoostRegressor: - ```scala - xgbClassifier.setTimeoutRequestWorkers(60000L) - ``` +.. code-block:: scala + + xgbClassifier.setTimeoutRequestWorkers(60000L) - or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier +or pass in ``timeout_request_workers`` in ``xgbParamMap`` when building XGBoostClassifier: - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "timeout_request_workers" -> 60000L) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") - ``` +.. code-block:: scala + + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "timeout_request_workers" -> 60000L) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. Users can have external mechanism to monitor the status of application and get notified for such case. -## Checkpoint During Training +Checkpoint During Training +========================== Transient failures are also commonly seen in production environment. To simplify the design of XGBoost, - we stop training if any of the distributed workers fail. However, if the training fails after having been through a long time, it would be a great resource waste on failing. 
+we stop training if any of the distributed workers fail. However, if the training fails after having been through a long time, it would be a great waste of resources. +We support creating checkpoints during training to facilitate more efficient recovery from failure. To enable this feature, you can set how many iterations we build each checkpoint with ``setCheckpointInterval`` and the location of checkpoints with ``setCheckpointPath``: -We support creating checkpoint during training to facilitate more efficient failure recovery. To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and the path store checkpointPath with `setCheckpointPath`: - - ```scala - xgbClassifier.setCheckpointInterval(2) - xgbClassifier.setCheckpointPath("/checkpoint_path") - ``` +.. code-block:: scala + + xgbClassifier.setCheckpointInterval(2) + xgbClassifier.setCheckpointPath("/checkpoint_path") - an equivalent way is to pass in parameters in XGBoostClassifier's constructor: +An equivalent way is to pass in parameters in XGBoostClassifier's constructor: - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "checkpoint_path" -> "/checkpoints_path", - "checkpoint_interval" -> 2) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") - ``` - -If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in `/checkpoints_path` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. - +..
code-block:: scala + + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "checkpoint_path" -> "/checkpoints_path", + "checkpoint_interval" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + +If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in ``/checkpoints_path`` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst index 77fbfd001818..db48d9fbf84a 100644 --- a/doc/tutorials/index.rst +++ b/doc/tutorials/index.rst @@ -10,7 +10,8 @@ See `Awesome XGBoost `_ for mo :caption: Contents: model - aws_yarn + Distributed XGBoost with AWS YARN + Distributed XGBoost with XGBoost4J-Spark dart monotonic input_format From 341a470b48aeaac228670483162a5424612ba92b Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Tue, 31 Jul 2018 20:49:43 -0700 Subject: [PATCH 18/25] Bring XGBoost4J up to date --- doc/jvm/index.rst | 3 +- doc/jvm/java_intro.rst | 98 ++++++++++++++++-------------------------- 2 files changed, 40 insertions(+), 61 deletions(-) diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst index 18e5e562a179..7812770bcdb8 100644 --- a/doc/jvm/index.rst +++ b/doc/jvm/index.rst @@ -136,8 +136,9 @@ Contents ******** .. toctree:: + :maxdepth: 2 - Java Overview Tutorial + java_intro XGBoost4J-Spark Tutorial Code Examples XGBoost4J Java API diff --git a/doc/jvm/java_intro.rst b/doc/jvm/java_intro.rst index 908f1b50fea9..c36b489ab36c 100644 --- a/doc/jvm/java_intro.rst +++ b/doc/jvm/java_intro.rst @@ -1,28 +1,28 @@ -################## -XGBoost4J Java API -################## +############################## +Getting Started with XGBoost4J +############################## This tutorial introduces Java API for XGBoost. 
************** Data Interface ************** -Like the XGBoost python module, XGBoost4J uses ``DMatrix`` to handle data, -libsvm txt format file, sparse matrix in CSR/CSC format, and dense matrix is +Like the XGBoost python module, XGBoost4J uses DMatrix to handle data. +LIBSVM text format files, sparse matrices in CSR/CSC format, and dense matrices are supported. -* The first step is to import ``DMatrix``: +* The first step is to import DMatrix: .. code-block:: java - import org.dmlc.xgboost4j.DMatrix; + import ml.dmlc.xgboost4j.java.DMatrix; -* Use ``DMatrix`` constructor to load data from a libsvm text format file: +* Use DMatrix constructor to load data from a libsvm text format file: .. code-block:: java DMatrix dmat = new DMatrix("train.svm.txt"); -* Pass arrays to ``DMatrix`` constructor to load from sparse matrix. +* Pass arrays to DMatrix constructor to load from sparse matrix. Suppose we have a sparse matrix @@ -78,47 +78,31 @@ supported. ****************** Setting Parameters ****************** -* In XGBoost4J any ``Iterable>`` object could be used as parameters. +Parameters are specified as a Map: -* To set parameters, for non-multiple value params, you can simply use entrySet of an Map: - - .. code-block:: java - - Map paramMap = new HashMap<>() { - { - put("eta", 1.0); - put("max_depth", 2); - put("silent", 1); - put("objective", "binary:logistic"); - put("eval_metric", "logloss"); - } - }; - Iterable> params = paramMap.entrySet(); - -* for the situation that multiple values with same param key, List> would be a good choice, e.g. : - - .. code-block:: java +..
code-block:: java - List> params = new ArrayList>() { - { - add(new SimpleEntry("eta", 1.0)); - add(new SimpleEntry("max_depth", 2.0)); - add(new SimpleEntry("silent", 1)); - add(new SimpleEntry("objective", "binary:logistic")); - } - }; + Map<String, Object> params = new HashMap<>() { + { + put("eta", 1.0); + put("max_depth", 2); + put("silent", 1); + put("objective", "binary:logistic"); + put("eval_metric", "logloss"); + } + }; ************** Training Model ************** With parameters and data, you are able to train a booster model. -* Import ``Trainer`` and ``Booster``: +* Import Booster and XGBoost: .. code-block:: java - import org.dmlc.xgboost4j.Booster; - import org.dmlc.xgboost4j.util.Trainer; + import ml.dmlc.xgboost4j.java.Booster; + import ml.dmlc.xgboost4j.java.XGBoost; * Training @@ -126,13 +110,13 @@ With parameters and data, you are able to train a booster model. DMatrix trainMat = new DMatrix("train.svm.txt"); DMatrix validMat = new DMatrix("valid.svm.txt"); - //specify a watchList to see the performance - //any Iterable> object could be used as watchList - List> watchs = new ArrayList<>(); - watchs.add(new SimpleEntry<>("train", trainMat)); - watchs.add(new SimpleEntry<>("test", testMat)); - int round = 2; - Booster booster = Trainer.train(params, trainMat, round, watchs, null, null); + // Specify a watchList to see the performance + // Any Iterable<Entry<String, DMatrix>> object could be used as watchList + List<Entry<String, DMatrix>> watches = new ArrayList<>(); + watches.add(new SimpleEntry<>("train", trainMat)); + watches.add(new SimpleEntry<>("test", validMat)); + int nround = 2; + Booster booster = XGBoost.train(trainMat, params, nround, watches, null, null); * Saving model @@ -142,25 +126,19 @@ With parameters and data, you are able to train a booster model. booster.saveModel("model.bin"); -* Dump Model and Feature Map +* Generating model dump with feature map ..
code-block:: java - booster.dumpModel("modelInfo.txt", false) - //dump with featureMap - booster.dumpModel("modelInfo.txt", "featureMap.txt", false) + String[] model_dump = booster.getModelDump(null, false) + // dump with feature map + String[] model_dump_with_feature_map = booster.getModelDump("featureMap.txt", false) * Load a model .. code-block:: java - Params param = new Params() { - { - put("silent", 1); - put("nthread", 6); - } - }; - Booster booster = new Booster(param, "model.bin"); + Booster booster = Booster.loadModel("model.bin"); ********** Prediction @@ -170,8 +148,8 @@ After training and loading a model, you can use it to make prediction for other .. code-block:: java DMatrix dtest = new DMatrix("test.svm.txt"); - //predict + // predict float[][] predicts = booster.predict(dtest); - //predict leaf - float[][] leafPredicts = booster.predict(dtest, 0, true); + // predict leaf + float[][] leafPredicts = booster.predictLeaf(dtest, 0); From c217cf8a4b17636f134cb2a1540af084a6dbf0a7 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 1 Aug 2018 09:57:10 -0700 Subject: [PATCH 19/25] add note about using hdfs --- doc/jvm/xgboost4j_spark_tutorial.rst | 99 ++--- doc/tutorials/xgboost4j_spark_tutorial.rst | 460 +++++++++++++++++++++ 2 files changed, 512 insertions(+), 47 deletions(-) create mode 100644 doc/tutorials/xgboost4j_spark_tutorial.rst diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index f185fb2c4593..61951c8933b7 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -3,12 +3,12 @@ XGBoost4J-Spark Tutorial (version 0.8+) ####################################### **XGBoost4J-Spark** is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. 
With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: - + * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. * Pipelines: constructing, evaluating, and tuning ML Pipelines * Persistence: persist and load machine learning models and even whole Pipelines -This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss +This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss * Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface * Training a XGBoost model with XGBoost4J-Spark @@ -27,7 +27,7 @@ Build an ML Application with XGBoost4J-Spark Refer to XGBoost4J-Spark Dependency =================================== -Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central. +Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central. You can add the following dependency in your ``pom.xml``. @@ -65,14 +65,14 @@ Data Preparation ================ As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables -users to apply various types of transformation over the training/test datasets with the convenient -and powerful data processing framework, Spark. - +users to apply various types of transformation over the training/test datasets with the convenient +and powerful data processing framework, Spark. 
+ In this section, we use `Iris `_ dataset as an example to showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost. Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", -"petal length" and "petal width". In addition, it contains the "class" columnm, which is essentially the label with three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". +"petal length" and "petal width". In addition, it contains the "class" columnm, which is essentially the label with three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". Read Dataset with Spark's Built-In Reader ----------------------------------------- @@ -83,7 +83,7 @@ The first thing in data transformation is to load the dataset as Spark's structu import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} - + val spark = SparkSession.builder().getOrCreate() val schema = new StructType(Array( StructField("sepal length", DoubleType, true), @@ -96,11 +96,11 @@ The first thing in data transformation is to load the dataset as Spark's structu At the first line, we create a instance of `SparkSession `_ which is the entry of any Spark program working with DataFrame. The ``schema`` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types; otherwise the column name would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named ``rawInput``. Spark also contains many built-in readers for other format. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM. 
- + Transform Raw Iris Dataset -------------------------- -To make Iris dataset be recognizable to XGBoost, we need to +To make Iris dataset be recognizable to XGBoost, we need to 1. Transform String-typed label, i.e. "class", to Double-typed label. 2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework. @@ -120,13 +120,13 @@ With a newly created StringIndexer instance: 1. we set input column, i.e. the column containing String-typed label 2. we set output column, i.e. the column to contain the Double-typed label. -3. Then we ``fit`` StringIndex with our input DataFrame ``rawInput``, so that Spark internals can get information like total number of distinct values, etc. +3. Then we ``fit`` StringIndex with our input DataFrame ``rawInput``, so that Spark internals can get information like total number of distinct values, etc. -Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we ``transform`` the input DataFrame ``rawInput`` and to keep a concise DataFrame, +Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we ``transform`` the input DataFrame ``rawInput`` and to keep a concise DataFrame, we drop the column "class" and only keeps the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). The ``fit`` and ``transform`` are two key operations in MLLIB. Basically, ``fit`` produces a "transformer", e.g. StringIndexer, and each transformer applies ``transform`` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about ``fit`` and ``transform``, You can find more details in `here `_. 
- + Similarly, we can use another transformer, `VectorAssembler `_, to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. .. code-block:: scala @@ -141,7 +141,7 @@ Now, we have a DataFrame containing only two columns, "features" which contains "sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. -Training +Training ======== XGBoost supports both regression and classification. While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-classes classification problem, the usage in Regression is very similar to classification. @@ -161,11 +161,11 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie setFeaturesCol("features"). setLabelCol("classIndex") -The available parameters for training a XGBoost model can be found in :doc:`here `. In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variant of these parameters to keep consistent with Spark's MLLIB parameters. +The available parameters for training a XGBoost model can be found in :doc:`here `. In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variant of these parameters to keep consistent with Spark's MLLIB parameters. -Specifically, each parameter in :doc:`this page ` has its +Specifically, each parameter in :doc:`this page ` has its equivalent form in XGBoost4J-Spark with camel case. For example, to set ``max_depth`` for each tree, you can pass parameter just like what we did in the above code snippet (as ``max_depth`` wrapped in a Map), or you can do it through setters in XGBoostClassifer: - + .. code-block:: scala val xgbClassifier = new XGBoostClassifier(). 
@@ -241,7 +241,7 @@ However, the overhead of single-instance prediction is high due to the internal val features = xgbInput.head().getAs[Vector]("features") val result = xgbClassificationModel.predict(features) -Model Persistence +Model Persistence ================= Model and pipeline persistence @@ -263,13 +263,13 @@ and then loading the model in another session: .. code-block:: scala import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - + val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) xgbClassificationModel2.transform(xgbInput) With regards to ML pipeline save and load, please refer the next section. -Export for Other Bindings of XGBoost +Interact with Other Bindings of XGBoost ------------------------------------ After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by: @@ -286,16 +286,22 @@ Then we can load this model with single node Python XGBoost: bst = xgb.Booster({'nthread': 4}) bst.load_model(nativeModelPath) +When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one: + +* You can build XGBoost4J-Spark with the steps described in `here `_, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. + +However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, we recommend download file manually from HDFS and load with the pre-built version of XGBoost. + .. 
note:: Consistency issue between XGBoost4J-Spark and other bindings - There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost. - + There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost. + When users use Spark to load training/test data in LIBSVM format with the following code snippet: - + .. code-block:: scala - + spark.read.format("libsvm").load("trainingset_libsvm") - + Spark assumes that the dataset is using 1-based indexing (feature indices staring with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0). It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API. ******************************************* @@ -340,7 +346,7 @@ We need to organize these steps as a Pipeline in Spark ML framework and evaluate import org.apache.spark.ml.feature._ import org.apache.spark.ml.Pipeline - + val pipeline = new Pipeline() .setStages(Array(assembler, stringIndexer, booster, labelConverter)) val model = pipeline.fit(training) @@ -350,14 +356,14 @@ After we get the PipelineModel, we can make prediction on the test dataset and e .. code-block:: scala import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator - + val prediction = model.transform(test) val evaluator = new MulticlassClassificationEvaluator() val accuracy = evaluator.evaluate(prediction) Pipeline with Hyper-parameter Tunning ===================================== -The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. 
With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process. +The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process. The following example shows the code snippet utilizing CrossValidation and MulticlassClassificationEvaluator to search the optimal combination of two XGBoost parameters, ``max_depth`` and ``eta``. (See :doc:`/parameter`.) @@ -368,7 +374,7 @@ The model producing the maximum accuracy defined by MulticlassClassificationEval import org.apache.spark.ml.tuning._ import org.apache.spark.ml.PipelineModel import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - + val paramGrid = new ParamGridBuilder() .addGrid(booster.maxDepth, Array(3, 8)) .addGrid(booster.eta, Array(0.2, 0.6)) @@ -382,7 +388,7 @@ The model producing the maximum accuracy defined by MulticlassClassificationEval val cvModel = cv.fit(training) val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) - .asInstanceOf[XGBoostClassificationModel] + .asInstanceOf[XGBoostClassificationModel] bestModel.extractParamMap() ********************************* @@ -393,7 +399,7 @@ XGBoost4J-Spark is one of the most important steps to bring XGBoost to productio Parallel/Distributed Training ============================= -The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. +The massive size of training dataset is one of the most significant characteristics in production environment. 
To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user. @@ -403,28 +409,28 @@ This parameter controls how many parallel workers we want to have when training .. note:: Regarding OpenMP optimization By default, we allocate a core per each XGBoost worker. Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved - by running multiple workers (i.e. Spark tasks) at the same time. - - If you do want OpenMP optimization, you have to - + by running multiple workers (i.e. Spark tasks) at the same time. + + If you do want OpenMP optimization, you have to + 1. set ``nthread`` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor 2. set ``spark.task.cpus`` in Spark to the same value as ``nthread`` - + Gang Scheduling =============== XGBoost uses `AllReduce `_. algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of ``nthread * numWorkers`` cores should be available before the training runs. - + In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. Additionally, this usually happens silently and does not bring the attention of users. 
- + XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, you can set with XGBoostClassifier/XGBoostRegressor: - + .. code-block:: scala xgbClassifier.setTimeoutRequestWorkers(60000L) - + or pass in ``timeout_request_workers`` in ``xgbParamMap`` when building XGBoostClassifier: - + .. code-block:: scala val xgbParam = Map("eta" -> 0.1f, @@ -437,7 +443,7 @@ or pass in ``timeout_request_workers`` in ``xgbParamMap`` when building XGBoostC val xgbClassifier = new XGBoostClassifier(xgbParam). setFeaturesCol("features"). setLabelCol("classIndex") - + If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. Users can have external mechanism to monitor the status of application and get notified for such case. Checkpoint During Training @@ -445,16 +451,16 @@ Checkpoint During Training Transient failures are also commonly seen in production environment. To simplify the design of XGBoost, we stop training if any of the distributed workers fail. However, if the training fails after having been through a long time, it would be a great waste of resources. - + We support creating checkpoint during training to facilitate more efficient recovery from failture. To enable this feature, you can set how many iterations we build each checkpoint with ``setCheckpointInterval`` and the location of checkpoints with ``setCheckpointPath``: - + .. code-block:: scala xgbClassifier.setCheckpointInterval(2) xgbClassifier.setCheckpointPath("/checkpoint_path") - + An equivalent way is to pass in parameters in XGBoostClassifier's constructor: - + .. 
code-block:: scala val xgbParam = Map("eta" -> 0.1f, @@ -470,4 +476,3 @@ An equivalent way is to pass in parameters in XGBoostClassifier's constructor: setLabelCol("classIndex") If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in ``/checkpoints_path`` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. - diff --git a/doc/tutorials/xgboost4j_spark_tutorial.rst b/doc/tutorials/xgboost4j_spark_tutorial.rst new file mode 100644 index 000000000000..921ff4963c26 --- /dev/null +++ b/doc/tutorials/xgboost4j_spark_tutorial.rst @@ -0,0 +1,460 @@ +# XGBoost4J-Spark Tutorial (version >= 0.8) + +XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: + + * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. + * Pipelines: constructing, evaluating, and tuning ML Pipelines + * Persistence: persist and load machine learning models and even whole Pipelines + +This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss + + * Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface + * Training a XGBoost model with XGBoost4J-Spark + * Serving XGBoost model (prediction) with Spark + * Building a Machine Learning Pipeline with XGBoost4J-Spark + * Running XGBoost4J-Spark in Production + +# Build an ML Application with XGBoost4J-Spark + +## Refer to XGBoost4J-Spark Dependency + +Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. 
The first thing you need to do is to refer to the dependency in Maven Central. + +You can add the following dependency in your pom file. + +```xml + + ml.dmlc + xgboost4j-spark + latest_version_num + +``` + +For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). + +We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in pom.xml: + +```xml + + XGBoost4J-Spark Snapshot Repo + XGBoost4J-Spark Snapshot Repo + https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ + +``` + +and then refer to the snapshot dependency by adding: + +```xml + + ml.dmlc + xgboost4j + next_version_num-SNAPSHOT + +``` + + +## Data Preparation + +As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables + users to apply various types of transformation over the training/test datasets with the convenient + and powerful data processing framework, Spark. + +In this section, we use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset as an example to + showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost. + +Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", +"petal length" and "petal width". "class" column in each instance is essentially the label which has three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". + +### Read Dataset with Spark's Built-In Reader + +The first thing in data transformation is to load the dataset as Spark's structured data abstraction, +DataFrame. 
+ +```scala + import org.apache.spark.sql.SparkSession + import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + + val spark = SparkSession.builder().getOrCreate() + val schema = new StructType(Array( + StructField("sepal length", DoubleType, true), + StructField("sepal width", DoubleType, true), + StructField("petal length", DoubleType, true), + StructField("petal width", DoubleType, true), + StructField("class", StringType, true))) + val rawInput = spark.read.schema(schema).csv("input_path") +``` + +At the first line, we create a instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) which is the entry of any Spark program working with DataFrame. The `schema` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types, otherwise the column name would be the default ones derived by Spark, such as `_col0`, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named `rawInput`. + +Spark also contains many built-in readers for other format. The latest version of Spark supports, csv/json/parquet/libsvm. + +### Transform Raw Iris Dataset + +To make Iris dataset be recognizable to XGBoost, we need to + +1. Transform String-typed label, i.e. "class", to Double-typed label. + +2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework. + +To convert String-typed label to Double, we can use Spark's built-in feature transformer StringIndexer. + +```scala + import org.apache.spark.ml.feature.StringIndexer + val stringIndexer = new StringIndexer(). + setInputCol("class"). + setOutputCol("classIndex"). + fit(rawInput) + val labelTransformed = stringIndexer.transform(rawInput).drop("class") +``` + +With a newly created StringIndexer instance: + +1. we set input column, i.e. the column containing String-typed label +2. 
we set output column, i.e. the column to contain the Double-typed label. +3. Then we `fit` StringIndexer with our input DataFrame, 'rawInput', so that Spark internals can get information like total number of distinct values, etc. + +Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we `transform` the input DataFrame, 'rawInput', and to keep a concise DataFrame, +we drop the column `class` and only keep the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). + +`fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each transformer applies the `transform` method on a DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about `fit` and `transform`, you can find more details [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). + +Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. + +```scala + import org.apache.spark.ml.feature.VectorAssembler + val vectorAssembler = new VectorAssembler(). + setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). + setOutputCol("features") + val xgbInput = vectorAssembler.transform(labelTransformed).select("features", + "classIndex") +``` + +Now, we have a DataFrame containing only two columns, "features" which contains vector-represented +"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed +labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. + + +## Training + +XGBoost supports both Regression and Classification.
While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-class Classification problem, the usage in Regression is very similar to Classification. + +To train a XGBoost model for classification, we need to create a XGBoostClassifier first: + +```scala + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") +``` + +The available parameters for training a XGBoost model can be found [here](https://xgboost.readthedocs.io/en/latest/parameter.html). In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case variants of these parameters to stay consistent with Spark's MLLIB parameters. + +Specifically, each parameter listed [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its +equivalent form in XGBoost4J-Spark with camel case. For example, to set max_depth for each tree, you can pass the parameter just like what we did in the above code snippet (as `max_depth` wrapped in a Map), or you can do it through setters in XGBoostClassifier: + + ```scala + val xgbClassifier = new XGBoostClassifier(). + setFeaturesCol("features"). + setLabelCol("classIndex") + xgbClassifier.setMaxDepth(2) + ``` + +After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel, by fitting XGBoostClassifier with the input DataFrame. This `fit` operation is essentially the training process and the generated model can then be used in Prediction. + +```scala + val xgbClassificationModel = xgbClassifier.fit(xgbInput) +``` + +## Prediction + +XGBoost4j-Spark supports two ways for model serving: batch prediction and single instance prediction.
+ +### Batch Prediction + +When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, read the column containing feature vectors, predict for each feature vector, and output a new DataFrame with the following columns by default: + +* `XGBoostClassificationModel` will output margins (`rawPredictionCol`), probabilities(`probabilityCol`) and the eventual prediction labels (`predictionCol`) for each possible label. +* `XGBoostRegressionModel` will output prediction label(`predictionCol`). + +Batch Prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch. + +```scala + val xgbClassificationModel = xgbClassifier.fit(xgbInput) + val results = xgbClassificationModel.transform(testSet) +``` + +With the above code snippet, we get a result DataFrame, result containing margin, probability for each class and the prediction for each instance + +```scala ++-----------------+----------+--------------------+--------------------+----------+ +| features|classIndex| rawPrediction| probability|prediction| ++-----------------+----------+--------------------+--------------------+----------+ +|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.9,3.1,1.5,0.1]| 
0.0|[3.45569849014282...|[0.99636095762252...| 0.0| +|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| +|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| +|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| +|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| +|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| +|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| ++-----------------+----------+--------------------+--------------------+----------+ + +``` + +### Single instance prediction + +`XGBoostClassificationModel` or `XGBoostRegressionModel` support make prediction on single instance as well. +It accepts a single Vector as feature, and output the prediction label. + +However, the overhead of single-instance prediction is high due to the internal overhead of XGBoost, use it carefully! + +```scala + val features = xgbInput.head().getAs[Vector]("features") + val result = xgbClassificationModel.predict(features) +``` + +## Model Persistence + +### Model and pipeline persistence + +A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. Reversely, a trained model may be used by data scientists, for example as a baseline, across the process of data exploration. So it's important to support model persistence to make the models available across usage scenarios and programming languages. 
+ +XGBoost4j-Spark supports saving/loading `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`; it also supports saving/loading a ML pipeline which includes these estimators and models. + +We can save the XGBoostClassificationModel to file system: + +```scala + val xgbClassificationModelPath = "/tmp/xgbClassificationModel" + xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath) +``` + +and then load the model in another session: + +```scala + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel + + val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) + xgbClassificationModel2.transform(xgbInput) +``` + +With regards to ML pipeline save and load, please refer to the next section. + +### Interacting with Other Bindings of XGBoost + +After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports exporting the model to the local file system by: + +```scala + val nativeModelPath = "/tmp/nativeModel" + xgbClassificationModel.nativeBooster.saveModel(nativeModelPath) +``` + +Then we can load this model with single node Python XGBoost: + +```python + import xgboost as xgb + bst = xgb.Booster({'nthread': 4}) + bst.load_model(nativeModelPath) +``` + +#### Passing Models through Non-local File System + +When interacting with other language bindings, we also support "saving-models-to" and "loading-models-from" file systems other than the local one. In general we have two approaches to do this: + +* Approach 1: + +NOTE: + +There is an inconsistency between XGBoost4J-Spark and other language bindings of XGBoost. + +When users use Spark to load trainingset/testset in LibSVM format with the following code snippet: + +```scala +spark.read.format("libsvm").load("trainingset_libsvm") +``` + +Spark assumes that the dataset is 1-based indexed.
However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is 0-based indexed. This creates a pitfall for users who train a model with Spark but predict with a dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, the Python API. + +# Building a ML Pipeline with XGBoost4J-Spark + +## Basic ML Pipeline + +Spark ML pipeline can combine multiple algorithms or functions into a single pipeline. +It covers everything from feature extraction/transformation/selection to model training/prediction. +XGBoost4j-Spark makes it feasible to embed XGBoost into such a pipeline seamlessly. +The following example shows how to build such a pipeline consisting of Spark MLlib feature transformers +and the XGBoostClassifier estimator. + +We still use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset and the ```rawInput``` DataFrame. +First we need to split the dataset into training and test datasets. + +```scala + val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) +``` + +Then we build the ML `Pipeline` which includes 4 stages: +* Assemble all features into a single vector column. +* Convert the string label to an indexed double label. +* Use `XGBoostClassifier` to train the classification model. +* Convert the indexed double label back to the original string label.
+ +We have shown the first three steps in the earlier sections, and the last step is finished with a new Transformer IndexToString: + +```scala + val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("realLabel") + .setLabels(stringIndexer.labels) +``` + +We need to organize these steps as a `Pipeline` in Spark ML framework and fit the whole pipeline to get a `PipelineModel`: + +```scala + import org.apache.spark.ml.feature._ + import org.apache.spark.ml.Pipeline + + val pipeline = new Pipeline() + .setStages(Array(assembler, stringIndexer, booster, labelConverter)) + val model = pipeline.fit(training) +``` + +After we get the PipelineModel, we can make predictions on the test dataset and evaluate the model accuracy. + +```scala + import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator + + val prediction = model.transform(test) + val evaluator = new MulticlassClassificationEvaluator() + val accuracy = evaluator.evaluate(prediction) +``` + +## Pipeline with Hyper-parameter Tuning + +The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, we can utilize Spark's model selection tools to automate this process. + +The following example shows the code snippet utilizing `CrossValidator` and `MulticlassClassificationEvaluator` +to search for the optimal combination of two XGBoost parameters, [`max_depth` and `eta`](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md). +The model producing the maximum accuracy defined by `MulticlassClassificationEvaluator` is selected and used to generate the prediction for the test set.
+ +```scala + import org.apache.spark.ml.tuning._ + import org.apache.spark.ml.PipelineModel + import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel + + val paramGrid = new ParamGridBuilder() + .addGrid(booster.maxDepth, Array(3, 8)) + .addGrid(booster.eta, Array(0.2, 0.6)) + .build() + val cv = new CrossValidator() + .setEstimator(pipeline) + .setEvaluator(evaluator) + .setEstimatorParamMaps(paramGrid) + .setNumFolds(3) + + val cvModel = cv.fit(training) + + val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) + .asInstanceOf[XGBoostClassificationModel] + bestModel.extractParamMap() +``` + +# Run XGBoost4J-Spark in Production + + +XGBoost4J-Spark is one of the most important steps to bring XGBoost to production environment easier. In this section, we introduce three key features to run XGBoost4J-Spark in production. + +## Parallel/Distributed Training + +The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. + +In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user. + +In the code snippet where we build XGBoostClassifier, we set parameter "num_workers" (or "numWorkers"). +This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. + + By default, we allocate a core per each XGBoost worker. Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved + by running multiple workers (i.e. Spark tasks) at the same time. + + If you do want OpenMP optimization, you have to + + 1. 
set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor + + 2. set `spark.task.cpus` in Spark to the same value as `nthread` + +## Gang Scheduling + +XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) +algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores should be available before the training runs. + +In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. Additionally, this usually happens silently and does not bring the attention of users. + +XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, you can set with XGBoostClassifier/XGBoostRegressor: + + ```scala + xgbClassifier.setTimeoutRequestWorkers(60000L) + ``` + + or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "timeout_request_workers" -> 60000L) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + ``` + +If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. 
Users can set up an external mechanism to monitor the status of the application and get notified in such a case. + +## Checkpoint During Training + +Transient failures are also commonly seen in production environment. To simplify the design of XGBoost, + we stop training if any of the distributed workers fail. However, if the training fails after having been running for a long time, it would be a great waste of resources. + + +We support creating checkpoints during training to facilitate more efficient failure recovery. To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and the location of checkpoints with `setCheckpointPath`: + + ```scala + xgbClassifier.setCheckpointInterval(2) + xgbClassifier.setCheckpointPath("/checkpoint_path") + ``` + + An equivalent way is to pass in parameters in XGBoostClassifier's constructor: + + ```scala + val xgbParam = Map("eta" -> 0.1f, + "max_depth" -> 2, + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 100, + "num_workers" -> 2, + "checkpoint_path" -> "/checkpoints_path", + "checkpoint_interval" -> 2) + val xgbClassifier = new XGBoostClassifier(xgbParam). + setFeaturesCol("features"). + setLabelCol("classIndex") + ``` + +If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in `/checkpoints_path` and start from the iteration when the checkpoint was built until the next failure or the specified 100 rounds.
From 17c65aacff0c7035ca08f7219dbc6fe7acf3d68c Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 1 Aug 2018 09:57:36 -0700 Subject: [PATCH 20/25] remove duplicate file --- doc/tutorials/xgboost4j_spark_tutorial.rst | 460 --------------------- 1 file changed, 460 deletions(-) delete mode 100644 doc/tutorials/xgboost4j_spark_tutorial.rst diff --git a/doc/tutorials/xgboost4j_spark_tutorial.rst b/doc/tutorials/xgboost4j_spark_tutorial.rst deleted file mode 100644 index 921ff4963c26..000000000000 --- a/doc/tutorials/xgboost4j_spark_tutorial.rst +++ /dev/null @@ -1,460 +0,0 @@ -# XGBoost4J-Spark Tutorial (version >= 0.8) - -XGBoost4J-Spark is a project aiming to seamlessly integrate XGBoost and Apache Spark by fitting XGBoost to Apache Spark's MLLIB framework. With the integration, user can not only uses the high-performant algorithm implementation of XGBoost, but also leverages the powerful data processing engine of Spark for: - - * Feature Engineering: feature extraction, transformation, dimensionality reduction, and selection, etc. - * Pipelines: constructing, evaluating, and tuning ML Pipelines - * Persistence: persist and load machine learning models and even whole Pipelines - -This tutorial is to cover the end-to-end process to build a machine learning pipeline with XGBoost4J-Spark. We will discuss - - * Using Spark to preprocess data to fit to XGBoost/XGBoost4J-Spark's data interface - * Training a XGBoost model with XGBoost4J-Spark - * Serving XGBoost model (prediction) with Spark - * Building a Machine Learning Pipeline with XGBoost4J-Spark - * Running XGBoost4J-Spark in Production - -# Build an ML Application with XGBoost4J-Spark - -## Refer to XGBoost4J-Spark Dependency - -Before we go into the tour of how to use XGBoost4J-Spark, we would bring a brief introduction about how to build a machine learning application with XGBoost4J-Spark. The first thing you need to do is to refer to the dependency in Maven Central. 
- -You can add the following dependency in your pom file. - -```xml - - ml.dmlc - xgboost4j-spark - latest_version_num - -``` - -For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). - -We also publish some functionalities which would be included in the coming release in the form of snapshot version. To access these functionalities, you can add dependency to the snapshot artifacts. We publish snapshot version in github-based repo, so you can add the following repo in pom.xml: - -```xml - - XGBoost4J-Spark Snapshot Repo - XGBoost4J-Spark Snapshot Repo - https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ - -``` - -and then refer to the snapshot dependency by adding: - -```xml - - ml.dmlc - xgboost4j - next_version_num-SNAPSHOT - -``` - - -## Data Preparation - -As aforementioned, XGBoost4J-Spark seamlessly integrates Spark and XGBoost. The integration enables - users to apply various types of transformation over the training/test datasets with the convenient - and powerful data processing framework, Spark. - -In this section, we use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset as an example to - showcase how we use Spark to transform raw dataset and make it fit to the data interface of XGBoost. - -Iris dataset is shipped in CSV format. Each instance contains 4 features, "sepal length", "sepal width", -"petal length" and "petal width". "class" column in each instance is essentially the label which has three possible values: "Iris Setosa", "Iris Versicolour" and "Iris Virginica". - -### Read Dataset with Spark's Built-In Reader - -The first thing in data transformation is to load the dataset as Spark's structured data abstraction, -DataFrame. 
- -```scala - import org.apache.spark.sql.SparkSession - import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} - - val spark = SparkSession.builder().getOrCreate() - val schema = new StructType(Array( - StructField("sepal length", DoubleType, true), - StructField("sepal width", DoubleType, true), - StructField("petal length", DoubleType, true), - StructField("petal width", DoubleType, true), - StructField("class", StringType, true))) - val rawInput = spark.read.schema(schema).csv("input_path") -``` - -At the first line, we create a instance of [SparkSession](http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession) which is the entry of any Spark program working with DataFrame. The `schema` variable defines the schema of DataFrame wrapping Iris data. With this explicitly set schema, we can define the columns' name as well as their types, otherwise the column name would be the default ones derived by Spark, such as `_col0`, etc. Finally, we can use Spark's built-in csv reader to load Iris csv file as a DataFrame named `rawInput`. - -Spark also contains many built-in readers for other format. The latest version of Spark supports, csv/json/parquet/libsvm. - -### Transform Raw Iris Dataset - -To make Iris dataset be recognizable to XGBoost, we need to - -1. Transform String-typed label, i.e. "class", to Double-typed label. - -2. Assemble the feature columns as a vector to fit to the data interface of Spark ML framework. - -To convert String-typed label to Double, we can use Spark's built-in feature transformer StringIndexer. - -```scala - import org.apache.spark.ml.feature.StringIndexer - val stringIndexer = new StringIndexer(). - setInputCol("class"). - setOutputCol("classIndex"). - fit(rawInput) - val labelTransformed = stringIndexer.transform(rawInput).drop("class") -``` - -With a newly created StringIndexer instance: - -1. we set input column, i.e. the column containing String-typed label -2. 
we set output column, i.e. the column to contain the Double-typed label. -3. Then we `fit` StringIndex with our input DataFrame, 'rawInput', so that Spark internals can get information like total number of distinct values, etc. - -Now we have a StringIndexer which is ready to be applied to our input DataFrame. To execute the transformation logic of StringIndexer, we `transform` the input DataFrame, 'rawInput' and to keep a concise DataFrame, -we drop the column `class` and only keeps the feature columns and the transformed Double-typed label column (in the last line of the above code snippet). - -`fit` and `transform` are two key operations in MLLIB. Basically, `fit` produces a "transformer", e.g. StringIndexer, and each transformer applies `transform` method on DataFrame to add new column(s) containing transformed features/labels or prediction results, etc. To understand more about `fit` and `transform`, You can find more details in [here](http://spark.apache.org/docs/latest/ml-pipeline.html#pipeline-components). - -Similarly, we can use another transformer, 'VectorAssembler', to assemble feature columns "sepal length", "sepal width", "petal length" and "petal width" as a vector. - -```scala - import org.apache.spark.ml.feature.VectorAssembler - val vectorAssembler = new VectorAssembler(). - setInputCols(Array("sepal length", "sepal width", "petal length", "petal width")). - setOutputCol("features") - val xgbInput = vectorAssembler.transform(labelTransformed).select("features", - "classIndex") -``` - -Now, we have a DataFrame containing only two columns, "features" which contains vector-represented -"sepal length", "sepal width", "petal length" and "petal width" and "classIndex" which has Double-typed -labels. A DataFrame like this (containing vector-represented features and numeric labels) can be fed to XGBoost4J-Spark's training engine directly. - - -## Training - -XGBoost support both Regression and Classification. 
While we use Iris dataset in this tutorial to show how we use XGBoost/XGBoost4J-Spark to resolve a multi-classes Classification problem, The usage in Regression is very similar to Classification. - -To train a XGBoost model for classification, we need to claim a XGBoostClassifier first: - -```scala - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") -``` - -The available parameters for training a XGBoost model can be found in [here](https://xgboost.readthedocs.io/en/latest/parameter.html). In XGBoost4J-Spark, we support not only the default set of parameters but also the camel-case-variance of these parameters to keep consistent with Spark's MLLIB parameters. - -Specifically, each parameter in [here](https://xgboost.readthedocs.io/en/latest/parameter.html) has its -equivalent form in XGBoost4J-Spark with camel case. For example, to set max_depth for each tree, you can pass parameter just like what we did in the above code snippet (as `max_depth` wrapped in a Map), or you can do it through setters in XGBoostClassifer: - - ```scala - val xgbClassifier = new XGBoostClassifier(). - setFeaturesCol("features"). - setLabelCol("classIndex") - xgbClassifier.setMaxDepth(2) - ``` - -After we set XGBoostClassifier parameters and feature/label column, we can build a transformer, XGBoostClassificationModel by fitting XGBoostClassifier with the input DataFrame. This `fit` operation is essentially the training process and the generated model can then be used in Prediction. - -```scala - val xgbClassificationModel = xgbClassifier.fit(xgbInput) -``` - -## Prediction - -XGBoost4j-Spark supports two ways for model serving: batch prediction and single instance prediction. 
- -### Batch Prediction - -When we get a model, either `XGBoostClassificationModel` or `XGBoostRegressionModel`, it takes a DataFrame, read the column containing feature vectors, predict for each feature vector, and output a new DataFrame with the following columns by default: - -* `XGBoostClassificationModel` will output margins (`rawPredictionCol`), probabilities(`probabilityCol`) and the eventual prediction labels (`predictionCol`) for each possible label. -* `XGBoostRegressionModel` will output prediction label(`predictionCol`). - -Batch Prediction expects the user to pass the testset in the form of a DataFrame. XGBoost4J-Spark starts a XGBoost worker for each partition of DataFrame for parallel prediction and generates prediction results for the whole DataFrame in a batch. - -```scala - val xgbClassificationModel = xgbClassifier.fit(xgbInput) - val results = xgbClassificationModel.transform(testSet) -``` - -With the above code snippet, we get a result DataFrame, result containing margin, probability for each class and the prediction for each instance - -```scala -+-----------------+----------+--------------------+--------------------+----------+ -| features|classIndex| rawPrediction| probability|prediction| -+-----------------+----------+--------------------+--------------------+----------+ -|[5.1,3.5,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.9,3.0,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.7,3.2,1.3,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.6,3.1,1.5,0.2]| 0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.0,3.6,1.4,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.4,3.9,1.7,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.6,3.4,1.4,0.3]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[5.0,3.4,1.5,0.2]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[4.4,2.9,1.4,0.2]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.9,3.1,1.5,0.1]| 
0.0|[3.45569849014282...|[0.99636095762252...| 0.0| -|[5.4,3.7,1.5,0.2]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[4.8,3.4,1.6,0.2]| 0.0|[3.45569849014282...|[0.99643349647521...| 0.0| -|[4.8,3.0,1.4,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[4.3,3.0,1.1,0.1]| 0.0|[3.45569849014282...|[0.99618089199066...| 0.0| -|[5.8,4.0,1.2,0.2]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.7,4.4,1.5,0.4]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.4,3.9,1.3,0.4]| 0.0|[3.45569849014282...|[0.99428516626358...| 0.0| -|[5.1,3.5,1.4,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -|[5.7,3.8,1.7,0.3]| 0.0|[3.45569849014282...|[0.97809928655624...| 0.0| -|[5.1,3.8,1.5,0.3]| 0.0|[3.45569849014282...|[0.99579632282257...| 0.0| -+-----------------+----------+--------------------+--------------------+----------+ - -``` - -### Single instance prediction - -`XGBoostClassificationModel` or `XGBoostRegressionModel` support make prediction on single instance as well. -It accepts a single Vector as feature, and output the prediction label. - -However, the overhead of single-instance prediction is high due to the internal overhead of XGBoost, use it carefully! - -```scala - val features = xgbInput.head().getAs[Vector]("features") - val result = xgbClassificationModel.predict(features) -``` - -## Model Persistence - -### Model and pipeline persistence - -A data scientist produces an ML model and hands it over to an engineering team for deployment in a production environment. Reversely, a trained model may be used by data scientists, for example as a baseline, across the process of data exploration. So it's important to support model persistence to make the models available across usage scenarios and programming languages. 
- -XGBoost4j-Spark supports save/load `XGBoostClassifier`/`XGBoostClassificationModel` and `XGBoostRegressor`/`XGBoostRegressionModel`, it also support save/load a ML pipeline which includes these estimators and models. - -We can save the XGBoostClassificationModel to file system: - -```scala - val xgbClassificationModelPath = "/tmp/xgbClassificationModel" - xgbClassificationModel.write.overwrite().save(xgbClassificationModelPath) -``` - -and then loading the model in another session: - -```scala - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - - val xgbClassificationModel2 = XGBoostClassificationModel.load(xgbClassificationModelPath) - xgbClassificationModel2.transform(xgbInput) -``` - -With regards to ML pipeline save and load, please refer the next section. - -### Interacting with Other Bindings of XGBoost - -After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by: - -```scala - val nativeModelPath = "/tmp/nativeModel" - xgbClassificationModel.nativeBooster.saveModel(nativeModelPath) -``` - -Then we can load this model with single node Python XGBoost: - -```python - import xgboost as xgb - bst = xgb.Booster({'nthread': 4}) - bst.load_model(nativeModelPath) -``` - -#### Passing Models through Non-local File System - -When interacting with other language bindings, we also support "saving-models-to" and "loading-models-from" file systems other than the local one. In general we have two approaches to do this: - -* Approach 1: - -NOTE: - -There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. - -When users use Spark to load trainingset/testset in LibSVM format with the following code snippet: - -```scala -spark.read.format("libsvm").load("trainingset_libsvm") -``` - -Spark assumes that the dataset is 1-based indexed. 
However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is 0-based indexed. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based before you predict with, for example, Python API. - -# Building a ML Pipeline with XGBoost4J-Spark - -## Basic ML Pipeline - -Spark ML pipeline can combine multiple algorithms or functions into a single pipeline. -It covers from feature extraction/transformation/selection to model training/prediction. -XGBoost4j-Spark makes it feasible to embed XGBoost into such a pipeline seamlessly. -The following example shows how to build such a pipeline consisting of Spark MLlib feature transformer -and XGBoostClassifier estimator. - -We still use [Iris](https://archive.ics.uci.edu/ml/datasets/iris) dataset and the ```rawInput``` DataFrame. -First we need to split the dataset into training and test dataset. - -```scala - val Array(training, test) = rawInput.randomSplit(Array(0.8, 0.2), 123) -``` - -The we build the ML `Pipeline` which includes 4 stages: -* Assemble all features into a single vector column. -* From string label to indexed double label. -* Use `XGBoostClassifier` to train classification model. -* Convert indexed double label back to original string label. 
- -We have shown the first three steps in the earlier sections, and the last step is finished with a new Transformer IndexToString: - -```scala - val labelConverter = new IndexToString() - .setInputCol("prediction") - .setOutputCol("realLabel") - .setLabels(stringIndexer.labels) -``` - -We need to organize these steps as a `Pipeline` in Spark ML framework and evaluate the whole pipeline to get a `PipelineModel`: - -```scala - import org.apache.spark.ml.feature._ - import org.apache.spark.ml.Pipeline - - val pipeline = new Pipeline() - .setStages(Array(assembler, stringIndexer, booster, labelConverter)) - val model = pipeline.fit(training) -``` - -After we get the PipelineModel, we can make prediction on the test dataset and evaluate the model accuracy. - -```scala - import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator - - val prediction = model.transform(test) - val evaluator = new MulticlassClassificationEvaluator() - val accuracy = evaluator.evaluate(prediction) -``` - -## Pipeline with Hyper-parameter Tunning - -The most critical operation to maximize the power of XGBoost is to select the optimal parameters for the model. Tuning parameters manually is a tedious and labor-consuming process. With the latest version of XGBoost4J-Spark, we can utilize the Spark model selecting tool to automate this process. - -The following example shows the code snippet utilizing `CrossValidation` and `MulticlassClassificationEvaluator` -to search the optimal combination of two XGBoost parameters, [`max_depth` and `eta`](https://github.com/dmlc/xgboost/blob/master/doc/parameter.md). -The model producing the maximum accuracy defined by `MulticlassClassificationEvaluator` is selected and used to generate the prediction for the test set. 
- -```scala - import org.apache.spark.ml.tuning._ - import org.apache.spark.ml.PipelineModel - import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel - - val paramGrid = new ParamGridBuilder() - .addGrid(booster.maxDepth, Array(3, 8)) - .addGrid(booster.eta, Array(0.2, 0.6)) - .build() - val cv = new CrossValidator() - .setEstimator(pipeline) - .setEvaluator(evaluator) - .setEstimatorParamMaps(paramGrid) - .setNumFolds(3) - - val cvModel = cv.fit(training) - - val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2) - .asInstanceOf[XGBoostClassificationModel] - bestModel.extractParamMap() -``` - -# Run XGBoost4J-Spark in Production - - -XGBoost4J-Spark is one of the most important steps to bring XGBoost to production environment easier. In this section, we introduce three key features to run XGBoost4J-Spark in production. - -## Parallel/Distributed Training - -The massive size of training dataset is one of the most significant characteristics in production environment. To ensure that training in XGBoost scales with the data size, XGBoost4J-Spark bridges the distributed/parallel processing framework of Spark and the parallel/distributed training mechanism of XGBoost. - -In XGBoost4J-Spark, each XGBoost worker is wrapped by a Spark task and the training dataset in Spark's memory space is fed to XGBoost workers in a transparent approach to the user. - -In the code snippet where we build XGBoostClassifier, we set parameter "num_workers" (or "numWorkers"). -This parameter controls how many parallel workers we want to have when training a XGBoostClassificationModel. - - By default, we allocate a core per each XGBoost worker. Therefore, the OpenMP optimization within each XGBoost worker does not take effect and the parallelization of training is achieved - by running multiple workers (i.e. Spark tasks) at the same time. - - If you do want OpenMP optimization, you have to - - 1. 
set `nthread` to a value larger than 1 when creating XGBoostClassifier/XGBoostRegressor - - 2. set `spark.task.cpus` in Spark to the same value as `nthread` - -## Gang Scheduling - -XGBoost uses [AllReduce](http://mpitutorial.com/tutorials/mpi-reduce-and-allreduce/) -algorithm to synchronize the stats, e.g. histogram values, of each worker during training. Therefore XGBoost4J-Spark requires that all of `nthread * numWorkers` cores should be available before the training runs. - -In the production environment where many users share the same cluster, it's hard to guarantee that your XGBoost4J-Spark application can get all requested resources for every run. By default, the communication layer in XGBoost will block the whole application when it requires more resources to be available. This process usually brings unnecessary resource waste as it keeps the ready resources and try to claim more. Additionally, this usually happens silently and does not bring the attention of users. - -XGBoost4J-Spark allows the user to setup a timeout threshold for claiming resources from the cluster. If the application cannot get enough resources within this time period, the application would fail instead of wasting resources for hanging long. To enable this feature, you can set with XGBoostClassifier/XGBoostRegressor: - - ```scala - xgbClassifier.setTimeoutRequestWorkers(60000L) - ``` - - or pass in `timeout_request_workers` in xgbParamMap when building XGBoostClassifier - - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "timeout_request_workers" -> 60000L) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") - ``` - -If XGBoost4J-Spark cannot get enough resources for running two XGBoost workers, the application would fail. 
Users can have external mechanism to monitor the status of application and get notified for such case. - -## Checkpoint During Training - -Transient failures are also commonly seen in production environment. To simplify the design of XGBoost, - we stop training if any of the distributed workers fail. However, if the training fails after having been through a long time, it would be a great resource waste on failing. - - -We support creating checkpoint during training to facilitate more efficient failure recovery. To enable this feature, you can set how many iterations we build each checkpoint with `setCheckpointInterval` and the path store checkpointPath with `setCheckpointPath`: - - ```scala - xgbClassifier.setCheckpointInterval(2) - xgbClassifier.setCheckpointPath("/checkpoint_path") - ``` - - an equivalent way is to pass in parameters in XGBoostClassifier's constructor: - - ```scala - val xgbParam = Map("eta" -> 0.1f, - "max_depth" -> 2, - "objective" -> "multi:softprob", - "num_class" -> 3, - "num_round" -> 100, - "num_workers" -> 2, - "checkpoint_path" -> "/checkpoints_path", - "checkpoint_interval" -> 2) - val xgbClassifier = new XGBoostClassifier(xgbParam). - setFeaturesCol("features"). - setLabelCol("classIndex") - ``` - -If the training failed during these 100 rounds, the next run of training would start by reading the latest checkpoint file in `/checkpoints_path` and start from the iteration when the checkpoint was built until to next failure or the specified 100 rounds. 
From e469c35db51cbf6f99f90d221f6539422b956346 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 1 Aug 2018 14:54:57 -0700 Subject: [PATCH 21/25] fix descriptions --- doc/jvm/xgboost4j_spark_tutorial.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 61951c8933b7..84ded5a8511f 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -290,7 +290,20 @@ When interacting with other language bindings, XGBoost also supports saving-mode * You can build XGBoost4J-Spark with the steps described in `here `_, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. -However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, we recommend download file manually from HDFS and load with the pre-built version of XGBoost. +However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, we recommend the following steps to pass models with HDFS, S3, etc. (taking HDFS as an example) + +* create a new file with **val outputStream = fs.create("hdfs_path")** + +* pass the returned OutputStream in the first step to **xgbClassificationModel.nativeBooster.saveModel(outputStream)** + +* download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost: + +.. code-block:: python + + import xgboost as xgb + bst = xgb.Booster({'nthread': 4}) + local_path = download_from_hdfs("hdfs_path") + bst.load_model(local_path) .. 
note:: Consistency issue between XGBoost4J-Spark and other bindings From 69ed50fb4be22d54eec1cccd84b21b9d6e5e35df Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Wed, 1 Aug 2018 15:20:01 -0700 Subject: [PATCH 22/25] update doc --- doc/jvm/xgboost4j_spark_tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 84ded5a8511f..8c7c638f91f5 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -292,11 +292,11 @@ When interacting with other language bindings, XGBoost also supports saving-mode However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, we recommend the following steps to pass models with HDFS, S3, etc. (taking HDFS as an example) -* create a new file with **val outputStream = fs.create("hdfs_path")** +* create a new file with **val outputStream = fs.create("hdfs_path")** where fs is an instance of org.apache.hadoop.fs.FileSystem class in Hadoop. * pass the returned OutputStream in the first step to **xgbClassificationModel.nativeBooster.saveModel(outputStream)** -* download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost: +* download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost (where download_from_hdfs is a helper function implemented by the user): .. 
code-block:: python From 164ac48757d07638a6149ac734045a90ae5ab065 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Wed, 1 Aug 2018 15:59:50 -0700 Subject: [PATCH 23/25] Wrap HDFS/S3 export support as a note --- doc/jvm/xgboost4j_spark_tutorial.rst | 36 +++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 8c7c638f91f5..933e0ab87ad8 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -286,24 +286,38 @@ Then we can load this model with single node Python XGBoost: bst = xgb.Booster({'nthread': 4}) bst.load_model(nativeModelPath) -When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one: +.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel() -* You can build XGBoost4J-Spark with the steps described in `here `_, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. + When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following: -However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, we recommend the following steps to pass models with HDFS, S3, etc. (taking HDFS as an example) + 1. Build XGBoost4J-Spark with the steps described in `here `_, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. 
With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. -* create a new file with **val outputStream = fs.create("hdfs_path")** where fs is an instance of org.apache.hadoop.fs.FileSystem class in Hadoop. + - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option. -* pass the returned OutputStream in the first step to **xgbClassificationModel.nativeBooster.saveModel(outputStream)** + 2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example): -* download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost (where download_from_hdfs is a helper function implemented by the user): + - Create a new file with -.. code-block:: python + .. code-block:: scala - import xgboost as xgb - bst = xgb.Booster({'nthread': 4}) - local_path = download_from_hdfs("hdfs_path") - bst.load_model(local_path) + val outputStream = fs.create("hdfs_path") + + where "fs" is an instance of `org.apache.hadoop.fs.FileSystem `_ class in Hadoop. + + - Pass the returned OutputStream in the first step to nativeBooster.saveModel(): + + .. code-block:: scala + + xgbClassificationModel.nativeBooster.saveModel(outputStream) + + - Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user) + + .. code-block:: python + + import xgboost as xgb + bst = xgb.Booster({'nthread': 4}) + local_path = download_from_hdfs("hdfs_path") + bst.load_model(local_path) .. 
note:: Consistency issue between XGBoost4J-Spark and other bindings From 2664034a27653f3009577f723af5844d5244eb33 Mon Sep 17 00:00:00 2001 From: Nan Zhu Date: Fri, 3 Aug 2018 09:32:09 -0700 Subject: [PATCH 24/25] update --- doc/jvm/xgboost4j_spark_tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 933e0ab87ad8..1195a3638f54 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -329,7 +329,7 @@ Then we can load this model with single node Python XGBoost: spark.read.format("libsvm").load("trainingset_libsvm") - Spark assumes that the dataset is using 1-based indexing (feature indices staring with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0). It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API. + Spark assumes that the dataset is using 1-based indexing (feature indices staring with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0) by default. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API, or you load "?indexing_mode=1" in your file path when loading with , e.g. **xgb.DMatrix(test.libsvm?indexing_mode=1)**. 
******************************************* Building a ML Pipeline with XGBoost4J-Spark From 9ee52f249e97276860915343a255bde9aef00064 Mon Sep 17 00:00:00 2001 From: Philip Cho Date: Fri, 3 Aug 2018 13:08:49 -0700 Subject: [PATCH 25/25] wrap indexing_mode example in code block --- doc/jvm/xgboost4j_spark_tutorial.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 1195a3638f54..8d679a9563e1 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -329,7 +329,11 @@ Then we can load this model with single node Python XGBoost: spark.read.format("libsvm").load("trainingset_libsvm") - Spark assumes that the dataset is using 1-based indexing (feature indices staring with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0) by default. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API, or you load "?indexing_mode=1" in your file path when loading with , e.g. **xgb.DMatrix(test.libsvm?indexing_mode=1)**. + Spark assumes that the dataset is using 1-based indexing (feature indices starting with 1). However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is using 0-based indexing (feature indices starting with 0) by default. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost.
The solution is to transform the dataset to 0-based indexing before you predict with, for example, Python API, or you append ``?indexing_mode=1`` to your file path when loading with DMatrix. For example in Python: + + .. code-block:: python + + xgb.DMatrix('test.libsvm?indexing_mode=1') ******************************************* Building a ML Pipeline with XGBoost4J-Spark