From 93a43bc5007acbc9c55a3eaf591f2b16df614c68 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Sat, 16 Apr 2016 12:27:44 -0700 Subject: [PATCH 1/3] supporting avgMetrics in CrossValidatorModel with Python --- python/pyspark/ml/tests.py | 2 ++ python/pyspark/ml/tuning.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a7a9868baccb..dd427743534a 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -534,6 +534,8 @@ def test_save_load(self): cvModel.save(cvModelPath) loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) + for index in range(len(loadedModel.avgMetrics)): + self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001) class TrainValidationSplitTests(PySparkTestCase): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 5ac539eddea5..7084dc165ba0 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -266,7 +266,7 @@ def _fit(self, dataset): else: bestIndex = np.argmin(metrics) bestModel = est.fit(dataset, epm[bestIndex]) - return self._copyValues(CrossValidatorModel(bestModel)) + return self._copyValues(CrossValidatorModel(bestModel, metrics)) @since("1.4.0") def copy(self, extra=None): @@ -346,10 +346,11 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): .. versionadded:: 1.4.0 """ - def __init__(self, bestModel): + def __init__(self, bestModel, avgMetrics): super(CrossValidatorModel, self).__init__() #: best model from cross validation self.bestModel = bestModel + self.avgMetrics = avgMetrics def _transform(self, dataset): return self.bestModel.transform(dataset) @@ -367,7 +368,9 @@ def copy(self, extra=None): """ if extra is None: extra = dict() - return CrossValidatorModel(self.bestModel.copy(extra)) + bestModel = self.bestModel.copy(extra) + avgMetrics = [am.copy(extra) for am in self.avgMetrics] + return CrossValidatorModel(bestModel, avgMetrics) @since("2.0.0") def write(self): @@ -394,9 +397,10 @@ def _from_java(cls, java_stage): # Load information from java_stage to the instance. bestModel = JavaParams._from_java(java_stage.bestModel()) + avgMetrics = [am for am in java_stage.avgMetrics()] estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage) # Create a new instance of this stage. - py_stage = cls(bestModel=bestModel)\ + py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics)\ .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator) py_stage._resetUid(java_stage.uid()) return py_stage @@ -408,12 +412,12 @@ def _to_java(self): :return: Java object equivalent to this instance. """ - sc = SparkContext._active_spark_context + java_avgMetrics = [avgMetric for avgMetric in self.avgMetrics] _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel", self.uid, self.bestModel._to_java(), - _py2java(sc, [])) + java_avgMetrics) estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl() _java_obj.set("evaluator", evaluator) From 25959e546e3efe6e6b1c16c805f16e2045bc440f Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Wed, 27 Apr 2016 05:06:55 -0700 Subject: [PATCH 2/3] address comment - update metrics to list of floats - use `numpy.testing.assert_almost_equal` to assert float list - test CrossValidator and CrossValidatorModel copy --- python/pyspark/ml/tests.py | 27 +++++++++++++++++++++++++-- python/pyspark/ml/tuning.py | 10 ++++------ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index dd427743534a..4244b7f80bec 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -41,6 +41,7 @@ from shutil import rmtree import tempfile import numpy as np +from numpy.testing import assert_almost_equal from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, OneVsRest @@ -461,6 +462,29 @@ def _fit(self, dataset): class CrossValidatorTests(PySparkTestCase): + def test_copy(self): + sqlContext = SQLContext(self.sc) + dataset = sqlContext.createDataFrame([ + (10, 10.0), + (50, 50.0), + (100, 100.0), + (500, 500.0)] * 10, + ["feature", "label"]) + + iee = InducedErrorEstimator() + evaluator = RegressionEvaluator(metricName="rmse") + + grid = (ParamGridBuilder() + .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) + .build()) + cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator) + cvCopied = cv.copy() + self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid) + + cvModel = cv.fit(dataset) + cvModelCopied = cvModel.copy() + assert_almost_equal(cvModel.avgMetrics, cvModelCopied.avgMetrics) + def test_fit_minimize_metric(self): sqlContext = SQLContext(self.sc) dataset = sqlContext.createDataFrame([ @@ -534,8 +558,7 @@ def test_save_load(self): cvModel.save(cvModelPath) loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - for index in range(len(loadedModel.avgMetrics)): - self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001) + assert_almost_equal(loadedModel.avgMetrics, cvModel.avgMetrics) class TrainValidationSplitTests(PySparkTestCase): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 7084dc165ba0..1b2ffd7cb6b0 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -248,7 +248,7 @@ def _fit(self, dataset): h = 1.0 / nFolds randCol = self.uid + "_rand" df = dataset.select("*", rand(seed).alias(randCol)) - metrics = np.zeros(numModels) + metrics = [0.0] * numModels for i in range(nFolds): validateLB = i * h validateUB = (i + 1) * h @@ -369,7 +369,7 @@ def copy(self, extra=None): if extra is None: extra = dict() bestModel = self.bestModel.copy(extra) - avgMetrics = [am.copy(extra) for am in self.avgMetrics] + avgMetrics = self.avgMetrics return CrossValidatorModel(bestModel, avgMetrics) @since("2.0.0") @@ -397,7 +397,7 @@ def _from_java(cls, java_stage): # Load information from java_stage to the instance. bestModel = JavaParams._from_java(java_stage.bestModel()) - avgMetrics = [am for am in java_stage.avgMetrics()] + avgMetrics = list(java_stage.avgMetrics()) estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage) # Create a new instance of this stage. py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics)\ @@ -412,12 +412,10 @@ def _to_java(self): :return: Java object equivalent to this instance. """ - java_avgMetrics = [avgMetric for avgMetric in self.avgMetrics] - _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel", self.uid, self.bestModel._to_java(), - java_avgMetrics) + self.avgMetrics) estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl() _java_obj.set("evaluator", evaluator) From 51b412fb455b6271460cf274cc45de73a1aec885 Mon Sep 17 00:00:00 2001 From: Kai Jiang Date: Thu, 28 Apr 2016 04:39:12 -0700 Subject: [PATCH 3/3] address comment --- python/pyspark/ml/tests.py | 8 +++++--- python/pyspark/ml/tuning.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 4244b7f80bec..e014233b1b02 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -41,7 +41,6 @@ from shutil import rmtree import tempfile import numpy as np -from numpy.testing import assert_almost_equal from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, OneVsRest @@ -483,7 +482,9 @@ def test_copy(self): cvModel = cv.fit(dataset) cvModelCopied = cvModel.copy() - assert_almost_equal(cvModel.avgMetrics, cvModelCopied.avgMetrics) + for index in range(len(cvModel.avgMetrics)): + self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index]) + < 0.0001) def test_fit_minimize_metric(self): sqlContext = SQLContext(self.sc) @@ -558,7 +559,8 @@ def test_save_load(self): cvModel.save(cvModelPath) loadedModel = CrossValidatorModel.load(cvModelPath) self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid) - assert_almost_equal(loadedModel.avgMetrics, cvModel.avgMetrics) + for index in range(len(loadedModel.avgMetrics)): + self.assertTrue(abs(loadedModel.avgMetrics[index] - cvModel.avgMetrics[index]) < 0.0001) class TrainValidationSplitTests(PySparkTestCase): diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 1b2ffd7cb6b0..043be3535c86 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -346,7 +346,7 @@ class CrossValidatorModel(Model, ValidatorParams, MLReadable, MLWritable): .. versionadded:: 1.4.0 """ - def __init__(self, bestModel, avgMetrics): + def __init__(self, bestModel, avgMetrics=[]): super(CrossValidatorModel, self).__init__() #: best model from cross validation self.bestModel = bestModel