From 0fc1162f17a952bcbd3b930dd1f250178ae40ee5 Mon Sep 17 00:00:00 2001 From: Mathew Wicks Date: Sun, 20 Oct 2019 16:48:26 +1100 Subject: [PATCH] [jvm-packages] initial pyspark api --- .../src/main/resources/ml/__init__.py | 0 .../src/main/resources/ml/dmlc/__init__.py | 0 .../resources/ml/dmlc/xgboost4j/__init__.py | 0 .../ml/dmlc/xgboost4j/scala/__init__.py | 0 .../ml/dmlc/xgboost4j/scala/spark/__init__.py | 22 ++ .../src/main/resources/setup.py | 57 +++ .../src/main/resources/sparkxgb/__init__.py | 20 ++ .../src/main/resources/sparkxgb/common.py | 85 +++++ .../src/main/resources/sparkxgb/util.py | 40 +++ .../src/main/resources/sparkxgb/xgboost.py | 326 ++++++++++++++++++ 10 files changed, 550 insertions(+) create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/ml/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/scala/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/scala/spark/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/setup.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/__init__.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/common.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/util.py create mode 100644 jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/xgboost.py diff --git a/jvm-packages/xgboost4j-spark/src/main/resources/ml/__init__.py b/jvm-packages/xgboost4j-spark/src/main/resources/ml/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/__init__.py b/jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/__init__.py new file mode 100644 index 
# 000000000000..e69de29bb2d1  (completes the patch "index" header of the empty ml/dmlc/__init__.py)
#
# Empty package markers added by this patch
# (new file mode 100644, index 000000000000..e69de29bb2d1):
#   jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/__init__.py
#   jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/scala/__init__.py

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/ml/dmlc/xgboost4j/scala/spark/__init__.py
# (new file mode 100644, index 000000000000..d64ecaefecab)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys

from sparkxgb import xgboost

# Allows Pipeline()/PipelineModel() with XGBoost stages to be loaded from disk.
# Needed because they try to import Python objects from their Java location.
sys.modules['ml.dmlc.xgboost4j.scala.spark'] = xgboost

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/setup.py
# (new file mode 100644, index 000000000000..a320a58ab764)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# `codecs` is imported as a module (instead of `from codecs import open`) so
# the builtin `open` is not shadowed; the call made is still `codecs.open`,
# which accepts `encoding=` on both Python 2 and Python 3.
import codecs
from os import path

from setuptools import find_packages, setup

# One-line summary; also used as the long description when README.md is absent.
short_description = 'spark-xgboost is the PySpark package for XGBoost'

# Read the long description from README.md, which is looked up next to this file.
here = path.abspath(path.dirname(__file__))
try:
    with codecs.open(path.join(here, 'README.md'), encoding='utf-8') as f:
        long_description = f.read()
except IOError:
    # IOError covers a missing/unreadable file on Python 2 and is an alias of
    # OSError on Python 3. A missing README should not make the build fail.
    long_description = short_description

setup(
    name='spark-xgboost',
    version='0.90',
    description=short_description,

    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://xgboost.ai/',
    author='DMLC',
    classifiers=[
        # Project Maturity
        'Development Status :: 5 - Production/Stable',

        # Intended Users
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',

        # License
        'License :: OSI Approved :: Apache Software License',

        # Supported Python Versions
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
    keywords='development spark xgboost',

    packages=find_packages(),
    include_package_data=False
)

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/__init__.py
# (new file mode 100644, index 000000000000..a633c635fcea)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from sparkxgb import xgboost
from sparkxgb.xgboost import (
    XGBoostClassificationModel,
    XGBoostClassifier,
    XGBoostRegressionModel,
    XGBoostRegressor,
)

__all__ = ["XGBoostClassifier", "XGBoostRegressor", "XGBoostClassificationModel", "XGBoostRegressionModel"]
__version__ = "0.90"

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/common.py
# (new file mode 100644, index 000000000000..a1d5fbccd6d5)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re

from pyspark.ml.param import Params
from pyspark.ml.util import JavaMLWritable
from pyspark.ml.wrapper import JavaModel, JavaEstimator

from sparkxgb.util import XGBoostReadable


class ParamGettersSetters(Params):
    """
    Mixin class used to generate the setters/getters for all params.

    For every param in ``self.params`` a ``get<Name>`` and a ``set<Name>``
    callable is attached to the *instance*, where ``<Name>`` is the param name
    with its first character (and every character following an underscore)
    upper-cased and the underscores removed, e.g. ``eta`` -> ``getEta``.
    """

    def _create_param_getters_and_setters(self):
        """
        Attach ``getX``/``setX`` accessors for every param that lacks one.

        Accessors that already exist (on the class or on the instance) are
        left untouched. The names generated here are remembered in
        ``self._generated_accessors`` so that :meth:`copy` can re-bind them.
        """
        generated = self.__dict__.setdefault("_generated_accessors", {})
        factories = (("get", self._get_param_value), ("set", self._set_param_value))
        for param in self.params:
            param_name = param.name
            # Same suffix for getter and setter, so compute it once.
            suffix = re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), param_name)
            for prefix, factory in factories:
                attr = prefix + suffix
                # Generates getter and setter only if not exists
                if not hasattr(self, attr):
                    setattr(self, attr, factory(param_name))
                    generated[attr] = (prefix, param_name)

    def _get_param_value(self, param_name):
        """
        Build a zero-argument getter for ``param_name`` bound to this instance.

        The getter returns ``self.getOrDefault(param_name)``, or ``None`` when
        that lookup raises ``KeyError``.
        """
        def r():
            try:
                return self.getOrDefault(param_name)
            except KeyError:
                return None
        return r

    def _set_param_value(self, param_name):
        """
        Build a one-argument setter for ``param_name`` bound to this instance.

        The setter stores the value through ``self.set`` and returns ``self``
        so calls can be chained.
        """
        def r(v):
            self.set(self.getParam(param_name), v)
            return self
        return r

    def copy(self, extra=None):
        """
        Copy this instance and re-bind the generated accessors to the copy.

        The generated accessors are closures stored in the instance
        ``__dict__``. ``Params.copy`` makes a shallow copy of that dict, so
        without re-binding, ``copied.getX()`` / ``copied.setX(v)`` would keep
        reading from and writing to the *original* object.

        :param extra: extra param values, forwarded to the parent ``copy``.
        :return: the copied instance.
        """
        that = super(ParamGettersSetters, self).copy(extra)
        generated = dict(self.__dict__.get("_generated_accessors", {}))
        # Give the copy its own registry instead of sharing the original's dict.
        that._generated_accessors = generated
        for attr, (prefix, param_name) in generated.items():
            if prefix == "get":
                accessor = that._get_param_value(param_name)
            else:
                accessor = that._set_param_value(param_name)
            setattr(that, attr, accessor)
        return that


class XGboostEstimator(JavaEstimator, XGBoostReadable, JavaMLWritable, ParamGettersSetters):
    """
    Mixin class for XGBoost estimators, like XGBoostClassifier and XGBoostRegressor.
    """

    def __init__(self, classname):
        """
        Create the backing Java estimator and mirror its params in Python.

        :param classname: fully qualified name of the Java/Scala estimator class.
        """
        super(XGboostEstimator, self).__init__()
        # Stored on the class so XGBoostReader can resolve the Java loader class.
        self.__class__._java_class_name = classname
        self._java_obj = self._new_java_obj(classname, self.uid)
        self._create_params_from_java()
        self._create_param_getters_and_setters()


class XGboostModel(JavaModel, XGBoostReadable, JavaMLWritable, ParamGettersSetters):
    """
    Mixin class for XGBoost models, like XGBoostClassificationModel and XGBoostRegressionModel.
    """

    def __init__(self, classname, java_model=None):
        """
        Wrap an existing Java model, or create a new Java object of ``classname``.

        :param classname: fully qualified name of the Java/Scala model class;
            only used when no ``java_model`` is supplied.
        :param java_model: an already existing Java model object to wrap, or None.
        """
        super(XGboostModel, self).__init__(java_model=java_model)
        if classname and not java_model:
            # Stored on the class so XGBoostReader can resolve the Java loader class.
            self.__class__._java_class_name = classname
            self._java_obj = self._new_java_obj(classname, self.uid)
        if java_model is not None:
            self._transfer_params_from_java()
        self._create_param_getters_and_setters()

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/util.py
# (new file mode 100644, index 000000000000..f1c49195e5e3)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.ml.util import JavaMLReadable, JavaMLReader


class XGBoostReadable(JavaMLReadable):
    """
    Mixin class that provides a read() method for XGBoostReader.
    """

    @classmethod
    def read(cls):
        """Returns an XGBoostReader instance for this class."""
        return XGBoostReader(cls)


class XGBoostReader(JavaMLReader):
    """
    A reader mixin class for XGBoost objects.
    """

    @classmethod
    def _java_loader_class(cls, clazz):
        """
        Return the Java class name used to load ``clazz``.

        Prefers a non-None ``clazz._java_class_name``; otherwise defers to
        ``JavaMLReader._java_loader_class``.
        """
        java_class_name = getattr(clazz, '_java_class_name', None)
        if java_class_name is not None:
            return java_class_name
        return JavaMLReader._java_loader_class(clazz)

# ---------------------------------------------------------------------------
# File: jvm-packages/xgboost4j-spark/src/main/resources/sparkxgb/xgboost.py
# (new file mode 100644, index 000000000000..091699ad387d)
# ---------------------------------------------------------------------------
#
# Copyright (c) 2019 by Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark import keyword_only

from sparkxgb.common import XGboostEstimator, XGboostModel


def _rename_reserved_kwargs(kwargs):
    """
    Return a copy of ``kwargs`` with Python-safe names mapped to real param names.

    ``lambda`` is a reserved keyword in Python, so the public signatures expose
    it as ``lambda_``; the param created from the Java object is named ``lambda``.
    The input dict is not modified.
    """
    renamed = dict(kwargs)
    if "lambda_" in renamed:
        renamed["lambda"] = renamed.pop("lambda_")
    return renamed


class XGBoostClassifier(XGboostEstimator):
    """
    A PySpark wrapper of ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
    """

    # Defined on the class (not only assigned in __init__) so that
    # XGBoostClassifier.read()/load() resolves the Java class even when no
    # instance has been created yet.
    _java_class_name = "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier"

    @keyword_only
    def __init__(self,
                 alpha=0.0,
                 baseMarginCol=None,
                 baseScore=0.5,
                 cacheTrainingSet=False,
                 checkpointInterval=-1,
                 checkpointPath="",
                 colsampleBylevel=1.0,
                 colsampleBytree=1.0,
                 contribPredictionCol=None,
                 ## EXCLUDED: customEval=None,
                 ## EXCLUDED: customObj=None,
                 eta=0.3,
                 evalMetric=None,
                 featuresCol="features",
                 gamma=0.0,
                 growPolicy="depthwise",
                 interactionConstraints=None,
                 labelCol="label",
                 lambda_=1.0,  # Rename of 'lambda' param, as this is a reserved keyword in python.
                 lambdaBias=0.0,
                 leafPredictionCol=None,
                 maxBins=16,
                 maxDeltaStep=0.0,
                 maxDepth=6,
                 maxLeaves=None,
                 maximizeEvaluationMetrics=None,
                 minChildWeight=1.0,
                 missing=float('nan'),
                 monotoneConstraints=None,
                 normalizeType="tree",
                 nthread=1,
                 numClass=None,
                 numEarlyStoppingRounds=0,
                 numRound=1,
                 numWorkers=1,
                 objective="reg:squarederror",
                 objectiveType=None,
                 predictionCol="prediction",
                 probabilityCol="probability",
                 rateDrop=0.0,
                 rawPredictionCol="rawPrediction",
                 sampleType="uniform",
                 scalePosWeight=1.0,
                 seed=0,
                 silent=0,
                 sketchEps=0.03,
                 skipDrop=0.0,
                 subsample=1.0,
                 thresholds=None,
                 timeoutRequestWorkers=1800000,
                 ## EXCLUDED: trackerConf=None,
                 trainTestRatio=1.0,
                 treeLimit=0,
                 treeMethod="auto",
                 useExternalMemory=False,
                 verbosity=1,
                 weightCol=None):
        """
        Create the Java estimator and apply the explicitly passed keyword arguments.

        Only keyword arguments are accepted (enforced by ``keyword_only``), and
        only the ones actually passed by the caller are forwarded to setParams.
        """
        super(XGBoostClassifier, self).__init__(classname="ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier")
        kwargs = self._input_kwargs
        # Forward the kwargs untouched: setParams is keyword-only with a
        # 'lambda_' parameter, so the 'lambda_' -> 'lambda' rename must happen
        # inside setParams, not before calling it.
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self,
                  alpha=0.0,
                  baseMarginCol=None,
                  baseScore=0.5,
                  cacheTrainingSet=False,
                  checkpointInterval=-1,
                  checkpointPath="",
                  colsampleBylevel=1.0,
                  colsampleBytree=1.0,
                  contribPredictionCol=None,
                  ## EXCLUDED: customEval=None,
                  ## EXCLUDED: customObj=None,
                  eta=0.3,
                  evalMetric=None,
                  featuresCol="features",
                  gamma=0.0,
                  growPolicy="depthwise",
                  interactionConstraints=None,
                  labelCol="label",
                  lambda_=1.0,  # Rename of 'lambda' param, as this is a reserved keyword in python.
                  lambdaBias=0.0,
                  leafPredictionCol=None,
                  maxBins=16,
                  maxDeltaStep=0.0,
                  maxDepth=6,
                  maxLeaves=None,
                  maximizeEvaluationMetrics=None,
                  minChildWeight=1.0,
                  missing=float('nan'),
                  monotoneConstraints=None,
                  normalizeType="tree",
                  nthread=1,
                  numClass=None,
                  numEarlyStoppingRounds=0,
                  numRound=1,
                  numWorkers=1,
                  objective="reg:squarederror",
                  objectiveType=None,
                  predictionCol="prediction",
                  probabilityCol="probability",
                  rateDrop=0.0,
                  rawPredictionCol="rawPrediction",
                  sampleType="uniform",
                  scalePosWeight=1.0,
                  seed=0,
                  silent=0,
                  sketchEps=0.03,
                  skipDrop=0.0,
                  subsample=1.0,
                  thresholds=None,
                  timeoutRequestWorkers=1800000,
                  ## EXCLUDED: trackerConf=None,
                  trainTestRatio=1.0,
                  treeLimit=0,
                  treeMethod="auto",
                  useExternalMemory=False,
                  verbosity=1,
                  weightCol=None):
        """
        Set the explicitly passed params on this estimator and return ``self._set(...)``.

        ``lambda_`` is translated to the real param name ``lambda`` before the
        values are handed to ``_set``.
        """
        kwargs = _rename_reserved_kwargs(self._input_kwargs)
        return self._set(**kwargs)

    def _create_model(self, java_model):
        """Wrap the fitted Java model in its Python counterpart."""
        return XGBoostClassificationModel(java_model=java_model)


class XGBoostClassificationModel(XGboostModel):
    """
    A PySpark wrapper of ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel
    """

    # Defined on the class so read()/load() works before any instance exists.
    _java_class_name = "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel"

    def __init__(self, classname="ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel", java_model=None):
        """
        :param classname: fully qualified name of the Java/Scala model class.
        :param java_model: an existing Java model object to wrap, or None.
        """
        super(XGBoostClassificationModel, self).__init__(classname=classname, java_model=java_model)

    @property
    def nativeBooster(self):
        """
        Get the native booster instance of this model.
        This is used to call low-level APIs on native booster, such as "getFeatureScore".
        """
        return self._call_java("nativeBooster")


class XGBoostRegressor(XGboostEstimator):
    """
    A PySpark wrapper of ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor
    """

    # Defined on the class (not only assigned in __init__) so that
    # XGBoostRegressor.read()/load() resolves the Java class even when no
    # instance has been created yet.
    _java_class_name = "ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor"

    @keyword_only
    def __init__(self,
                 alpha=0.0,
                 baseMarginCol=None,
                 baseScore=0.5,
                 cacheTrainingSet=False,
                 checkpointInterval=-1,
                 checkpointPath="",
                 colsampleBylevel=1.0,
                 colsampleBytree=1.0,
                 contribPredictionCol=None,
                 ## EXCLUDED: customEval=None,
                 ## EXCLUDED: customObj=None,
                 eta=0.3,
                 evalMetric=None,
                 featuresCol="features",
                 gamma=0.0,
                 groupCol=None,
                 growPolicy="depthwise",
                 interactionConstraints=None,
                 labelCol="label",
                 lambda_=1.0,  # Rename of 'lambda' param, as this is a reserved keyword in python.
                 lambdaBias=0.0,
                 leafPredictionCol=None,
                 maxBins=16,
                 maxDeltaStep=0.0,
                 maxDepth=6,
                 maxLeaves=None,
                 maximizeEvaluationMetrics=None,
                 minChildWeight=1.0,
                 missing=float('nan'),
                 monotoneConstraints=None,
                 normalizeType="tree",
                 nthread=1,
                 numClass=None,
                 numEarlyStoppingRounds=0,
                 numRound=1,
                 numWorkers=1,
                 objective="reg:squarederror",
                 objectiveType=None,
                 predictionCol="prediction",
                 probabilityCol="probability",
                 rateDrop=0.0,
                 rawPredictionCol="rawPrediction",
                 sampleType="uniform",
                 scalePosWeight=1.0,
                 seed=0,
                 silent=0,
                 sketchEps=0.03,
                 skipDrop=0.0,
                 subsample=1.0,
                 thresholds=None,
                 timeoutRequestWorkers=1800000,
                 ## EXCLUDED: trackerConf=None,
                 trainTestRatio=1.0,
                 treeLimit=0,
                 treeMethod="auto",
                 useExternalMemory=False,
                 verbosity=1,
                 weightCol=None):
        """
        Create the Java estimator and apply the explicitly passed keyword arguments.

        Only keyword arguments are accepted (enforced by ``keyword_only``), and
        only the ones actually passed by the caller are forwarded to setParams.
        """
        super(XGBoostRegressor, self).__init__(classname="ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor")
        kwargs = self._input_kwargs
        # Forward the kwargs untouched: setParams is keyword-only with a
        # 'lambda_' parameter, so the 'lambda_' -> 'lambda' rename must happen
        # inside setParams, not before calling it.
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self,
                  alpha=0.0,
                  baseMarginCol=None,
                  baseScore=0.5,
                  cacheTrainingSet=False,
                  checkpointInterval=-1,
                  checkpointPath="",
                  colsampleBylevel=1.0,
                  colsampleBytree=1.0,
                  contribPredictionCol=None,
                  ## EXCLUDED: customEval=None,
                  ## EXCLUDED: customObj=None,
                  eta=0.3,
                  evalMetric=None,
                  featuresCol="features",
                  gamma=0.0,
                  groupCol=None,
                  growPolicy="depthwise",
                  interactionConstraints=None,
                  labelCol="label",
                  lambda_=1.0,  # Rename of 'lambda' param, as this is a reserved keyword in python.
                  lambdaBias=0.0,
                  leafPredictionCol=None,
                  maxBins=16,
                  maxDeltaStep=0.0,
                  maxDepth=6,
                  maxLeaves=None,
                  maximizeEvaluationMetrics=None,
                  minChildWeight=1.0,
                  missing=float('nan'),
                  monotoneConstraints=None,
                  normalizeType="tree",
                  nthread=1,
                  numClass=None,
                  numEarlyStoppingRounds=0,
                  numRound=1,
                  numWorkers=1,
                  objective="reg:squarederror",
                  objectiveType=None,
                  predictionCol="prediction",
                  probabilityCol="probability",
                  rateDrop=0.0,
                  rawPredictionCol="rawPrediction",
                  sampleType="uniform",
                  scalePosWeight=1.0,
                  seed=0,
                  silent=0,
                  sketchEps=0.03,
                  skipDrop=0.0,
                  subsample=1.0,
                  thresholds=None,
                  timeoutRequestWorkers=1800000,
                  ## EXCLUDED: trackerConf=None,
                  trainTestRatio=1.0,
                  treeLimit=0,
                  treeMethod="auto",
                  useExternalMemory=False,
                  verbosity=1,
                  weightCol=None):
        """
        Set the explicitly passed params on this estimator and return ``self._set(...)``.

        ``lambda_`` is translated to the real param name ``lambda`` before the
        values are handed to ``_set``.
        """
        kwargs = _rename_reserved_kwargs(self._input_kwargs)
        return self._set(**kwargs)

    def _create_model(self, java_model):
        """Wrap the fitted Java model in its Python counterpart."""
        return XGBoostRegressionModel(java_model=java_model)


class XGBoostRegressionModel(XGboostModel):
    """
    A PySpark wrapper of ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel
    """

    # Defined on the class so read()/load() works before any instance exists.
    _java_class_name = "ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel"

    def __init__(self, classname="ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel", java_model=None):
        """
        :param classname: fully qualified name of the Java/Scala model class.
        :param java_model: an existing Java model object to wrap, or None.
        """
        super(XGBoostRegressionModel, self).__init__(classname=classname, java_model=java_model)

    @property
    def nativeBooster(self):
        """
        Get the native booster instance of this model.
        This is used to call low-level APIs on native booster, such as "getFeatureScore".
        """
        return self._call_java("nativeBooster")