From eaeed35ba0ee22132afd88e51d8808bb8defb122 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Wed, 6 May 2015 21:08:23 -0700
Subject: [PATCH 01/17] update Identifiable

---
 .../apache/spark/ml/util/Identifiable.scala   | 26 ++++++++--
 .../spark/ml/util/IdentifiableSuite.scala     | 50 +++++++++++++++++++
 2 files changed, 71 insertions(+), 5 deletions(-)
 create mode 100644 mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala

diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
index 8a56748ab0a0..3f00a6a04b54 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala
@@ -19,15 +19,31 @@ package org.apache.spark.ml.util

 import java.util.UUID

+
 /**
- * Object with a unique id.
+ * Object with a unique ID that identifies itself and its derivatives.
  */
 private[ml] trait Identifiable extends Serializable {

   /**
-   * A unique id for the object. The default implementation concatenates the class name, "_", and 8
-   * random hex chars.
+   * A unique ID for the object and its derivatives. The default implementation concatenates
+   * [[simpleClassName]], "_", and 8 random hex chars.
+   */
+  final def uid: String = _uid
+
+  /**
+   * A simple name of the class, which is used as the first part of the generated UID. The default
+   * implementation uses [[java.lang.Class#getSimpleName()]].
    */
-  private[ml] val uid: String =
-    this.getClass.getSimpleName + "_" + UUID.randomUUID().toString.take(8)
+  protected def simpleClassName: String = this.getClass.getSimpleName
+
+  /**
+   * Sets the UID of the instance.
+   */
+  protected final def setUID(uid: String): this.type = {
+    this._uid = uid
+    this
+  }
+
+  private var _uid = simpleClassName + "_" + UUID.randomUUID().toString.take(8)
 }
diff --git a/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala
new file mode 100644
index 000000000000..7c222c11b7f2
--- /dev/null
+++ b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.util
+
+import org.scalatest.FunSuite
+
+class IdentifiableSuite extends FunSuite {
+
+  import IdentifiableSuite._
+
+  test("Identifiable") {
+    val test0 = new Test0
+    assert(test0.uid.startsWith(classOf[Test0].getSimpleName + "_"))
+
+    val test1 = new Test1
+    assert(test1.uid.startsWith("test_"),
+      "simpleClassName should be the first part of the generated UID.")
+    val copied = test1.copy
+    assert(copied.uid === test1.uid, "Copied objects should be able to use the same UID.")
+  }
+}
+
+object IdentifiableSuite {
+
+  class Test0 extends Identifiable
+
+  class Test1 extends Identifiable {
+
+    override def simpleClassName: String = "test"
+
+    def copy: Test1 = {
+      new Test1().setUID(uid)
+    }
+  }
+}

From 8726d39d3a6ff3a4285b80661b2cf4c48b830508 Mon Sep 17 00:00:00 2001
From: Xiangrui Meng
Date: Wed, 6 May 2015 21:39:40 -0700
Subject: [PATCH 02/17] use parent uid in Param

---
 .../org/apache/spark/ml/param/params.scala    | 74 +++++++++++--------
 1 file changed, 45 insertions(+), 29 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 51ce19d29cd2..f76a54823db5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -39,10 +39,10 @@ import org.apache.spark.ml.util.Identifiable
  * @tparam T param value type
  */
 @AlphaComponent
-class Param[T] (val parent: Params, val name: String, val doc: String, val isValid: T => Boolean)
+class Param[T] (val parent: String, val name: String, val doc: String, val isValid: T => Boolean)
   extends Serializable {

-  def this(parent: Params, name: String, doc: String) =
+  def this(parent: String, name: String, doc: String) =
     this(parent, name, doc, ParamValidators.alwaysTrue[T])

   /**
@@ -59,8 +59,7 @@ class Param[T] (val parent: String, val name: String, val doc: String, val isVal
    */
   private[param] def validate(value: T): Unit = {
     if (!isValid(value)) {
-      throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value." +
-        s" Parameter description: $toString")
+      throw new IllegalArgumentException(s"$parent parameter $name given invalid value $value.")
     }
   }

@@ -74,19 +73,15 @@ class Param[T] (val parent: String, val name: String, val doc: String, val isVal
    */
   def ->(value: T): ParamPair[T] = ParamPair(this, value)

-  /**
-   * Converts this param's name, doc, and optionally its default value and the user-supplied
-   * value in its parent to string.
-   */
-  override def toString: String = {
-    val valueStr = if (parent.isDefined(this)) {
-      val defaultValueStr = parent.getDefault(this).map("default: " + _)
-      val currentValueStr = parent.get(this).map("current: " + _)
-      (defaultValueStr ++ currentValueStr).mkString("(", ", ", ")")
-    } else {
-      "(undefined)"
+  override final def toString: String = s"${parent}__$name"
+
+  override final def hashCode: Int = toString.##
+
+  override final def equals(obj: Any): Boolean = {
+    obj match {
+      case p: Param[_] => (p.parent == parent) && (p.name == name)
+      case _ => false
     }
-    s"$name: $doc $valueStr"
   }
 }

@@ -172,47 +167,47 @@ object ParamValidators {

 // specialize primitive-typed params because Java doesn't recognize scala.Double, scala.Int, ...

 /** Specialized version of [[Param[Double]]] for Java.
*/ -class DoubleParam(parent: Params, name: String, doc: String, isValid: Double => Boolean) +class DoubleParam(parent: String, name: String, doc: String, isValid: Double => Boolean) extends Param[Double](parent, name, doc, isValid) { - def this(parent: Params, name: String, doc: String) = + def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Double): ParamPair[Double] = super.w(value) } /** Specialized version of [[Param[Int]]] for Java. */ -class IntParam(parent: Params, name: String, doc: String, isValid: Int => Boolean) +class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolean) extends Param[Int](parent, name, doc, isValid) { - def this(parent: Params, name: String, doc: String) = + def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Int): ParamPair[Int] = super.w(value) } /** Specialized version of [[Param[Float]]] for Java. */ -class FloatParam(parent: Params, name: String, doc: String, isValid: Float => Boolean) +class FloatParam(parent: String, name: String, doc: String, isValid: Float => Boolean) extends Param[Float](parent, name, doc, isValid) { - def this(parent: Params, name: String, doc: String) = + def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Float): ParamPair[Float] = super.w(value) } /** Specialized version of [[Param[Long]]] for Java. */ -class LongParam(parent: Params, name: String, doc: String, isValid: Long => Boolean) +class LongParam(parent: String, name: String, doc: String, isValid: Long => Boolean) extends Param[Long](parent, name, doc, isValid) { - def this(parent: Params, name: String, doc: String) = + def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) override def w(value: Long): ParamPair[Long] = super.w(value) } /** Specialized version of [[Param[Boolean]]] for Java. */ -class BooleanParam(parent: Params, name: String, doc: String) // No need for isValid +class BooleanParam(parent: String, name: String, doc: String) // No need for isValid extends Param[Boolean](parent, name, doc) { override def w(value: Boolean): ParamPair[Boolean] = super.w(value) @@ -278,9 +273,30 @@ trait Params extends Identifiable with Serializable { } /** - * Returns the documentation of all params. + * Explains a param. + * @param param input param, must belong to this instance. + * @return a string that contains the input param name, doc, and optionally its default value and + * the user-supplied value + */ + def explainParam(param: Param[_]): String = { + shouldOwn(param) + val valueStr = if (isDefined(param)) { + val defaultValueStr = getDefault(param).map("default: " + _) + val currentValueStr = get(param).map("current: " + _) + (defaultValueStr ++ currentValueStr).mkString("(", ", ", ")") + } else { + "(undefined)" + } + s"${param.name}: ${param.doc} $valueStr" + } + + /** + * Explains all params of this instance. + * @see [[explainParam()]] */ - def explainParams(): String = params.mkString("\n") + def explainParams(): String = { + params.map(explainParam).mkString("\n") + } /** Checks whether a param is explicitly set. */ final def isSet(param: Param[_]): Boolean = { @@ -432,7 +448,7 @@ trait Params extends Identifiable with Serializable { /** Validates that the input param belongs to this instance. 
*/ private def shouldOwn(param: Param[_]): Unit = { - require(param.parent.eq(this), s"Param $param does not belong to $this.") + require(param.parent == uid, s"Param $param does not belong to $this.") } /** @@ -548,7 +564,7 @@ final class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) override def toString: String = { map.toSeq.sortBy(_._1.name).map { case (param, value) => - s"\t${param.parent.uid}-${param.name}: $value" + s"\t${param.parent}-${param.name}: $value" }.mkString("{\n", ",\n", "\n}") } From 108937eb5501801387137b15ec8d7003d4d717b5 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 7 May 2015 13:16:01 -0700 Subject: [PATCH 03/17] pass compile --- .../scala/org/apache/spark/ml/Model.scala | 7 +++- .../scala/org/apache/spark/ml/Pipeline.scala | 13 ++++--- .../scala/org/apache/spark/ml/Predictor.scala | 2 +- .../DecisionTreeClassifier.scala | 12 +++--- .../ml/classification/GBTClassifier.scala | 14 ++++--- .../classification/LogisticRegression.scala | 11 ++++-- .../RandomForestClassifier.scala | 12 +++--- .../BinaryClassificationEvaluator.scala | 9 +++-- .../apache/spark/ml/feature/Binarizer.scala | 9 +++-- .../apache/spark/ml/feature/HashingTF.scala | 7 +++- .../org/apache/spark/ml/feature/IDF.scala | 10 +++-- .../apache/spark/ml/feature/Normalizer.scala | 7 +++- .../spark/ml/feature/OneHotEncoder.scala | 8 ++-- .../ml/feature/PolynomialExpansion.scala | 8 +++- .../spark/ml/feature/StandardScaler.scala | 10 +++-- .../spark/ml/feature/StringIndexer.scala | 10 +++-- .../apache/spark/ml/feature/Tokenizer.scala | 10 ++++- .../spark/ml/feature/VectorAssembler.scala | 6 ++- .../spark/ml/feature/VectorIndexer.scala | 13 +++++-- .../apache/spark/ml/feature/Word2Vec.scala | 10 +++-- .../org/apache/spark/ml/param/params.scala | 37 ++++++++++++++++--- .../ml/param/shared/SharedParamsCodeGen.scala | 2 +- .../spark/ml/param/shared/sharedParams.scala | 34 ++++++++--------- .../apache/spark/ml/recommendation/ALS.scala | 29 +++++++++------ .../ml/regression/DecisionTreeRegressor.scala | 12 +++--- .../spark/ml/regression/GBTRegressor.scala | 14 ++++--- .../ml/regression/LinearRegression.scala | 15 +++++--- .../ml/regression/RandomForestRegressor.scala | 12 +++--- .../spark/ml/tuning/CrossValidator.scala | 18 +++++---- .../apache/spark/ml/util/Identifiable.scala | 25 +++++-------- 30 files changed, 244 insertions(+), 142 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala index 9974efe7b1d2..f310db3128b2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala @@ -32,7 +32,12 @@ abstract class Model[M <: Model[M]] extends Transformer { * The parent estimator that produced this model. * Note: For ensembles' component Models, this value can be null. */ - val parent: Estimator[M] + var parent: Estimator[M] = _ + + def setParent(parent: Estimator[M]): M = { + this.parent = parent + this.asInstanceOf[M] + } override def copy(extra: ParamMap): M = { // The default implementation of Params.copy doesn't work for models. 
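[Aside, not part of the patch series: a minimal sketch of how a stage written against the API introduced so far might look. It assumes string-keyed Params from patch 02, plus the Identifiable.randomUID helper and Model.setParent from patch 03; the class and parameter names are hypothetical, and because PipelineStage.transformSchema is package-private at this point, such a class would have to live somewhere under the org.apache.spark.ml package tree.]

package org.apache.spark.ml.example  // hypothetical package, assumed to sit under org.apache.spark.ml

import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.param.{DoubleParam, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

// An estimator written against the new API: it takes its uid as a constructor
// argument and generates one with Identifiable.randomUID in the no-arg constructor.
class MyScaler(override val uid: String) extends Estimator[MyScalerModel] {

  def this() = this(Identifiable.randomUID("myScaler"))

  // Params are now keyed by the owner's uid (a String), not by the owner instance.
  val scale: DoubleParam = new DoubleParam(uid, "scale", "scaling factor")

  override def fit(dataset: DataFrame): MyScalerModel = {
    // The fitted model shares the estimator's uid and records its parent explicitly.
    copyValues(new MyScalerModel(uid).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = schema
}

class MyScalerModel(override val uid: String) extends Model[MyScalerModel] {

  override def transform(dataset: DataFrame): DataFrame = dataset

  override def transformSchema(schema: StructType): StructType = schema

  override def copy(extra: ParamMap): MyScalerModel = {
    // Models cannot use the reflective default copy, so they rebuild themselves
    // with the same uid and copy the param values over.
    copyValues(new MyScalerModel(uid), extra)
  }
}
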
diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index 33d430f5671e..fc5a76b01949 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -22,6 +22,7 @@ import scala.collection.mutable.ListBuffer import org.apache.spark.Logging import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} import org.apache.spark.ml.param.{Param, ParamMap, Params} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -80,13 +81,15 @@ abstract class PipelineStage extends Params with Logging { * an identity transformer. */ @AlphaComponent -class Pipeline extends Estimator[PipelineModel] { +class Pipeline(override val uid: String) extends Estimator[PipelineModel] { + + def this() = this(Identifiable.randomUID("pipeline")) /** * param for pipeline stages * @group param */ - val stages: Param[Array[PipelineStage]] = new Param(this, "stages", "stages of the pipeline") + val stages: Param[Array[PipelineStage]] = new Param(uid, "stages", "stages of the pipeline") /** @group setParam */ def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this } @@ -148,7 +151,7 @@ class Pipeline extends Estimator[PipelineModel] { } } - new PipelineModel(this, transformers.toArray) + new PipelineModel(uid, transformers.toArray).setParent(this) } override def copy(extra: ParamMap): Pipeline = { @@ -171,7 +174,7 @@ class Pipeline extends Estimator[PipelineModel] { */ @AlphaComponent class PipelineModel private[ml] ( - override val parent: Pipeline, + val uid: String, val stages: Array[Transformer]) extends Model[PipelineModel] with Logging { @@ -190,6 +193,6 @@ class PipelineModel private[ml] ( } override def copy(extra: ParamMap): PipelineModel = { - new PipelineModel(parent, stages) + new PipelineModel(uid, stages) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala index 0e53877de92d..4e5c6b602419 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Predictor.scala @@ -88,7 +88,7 @@ abstract class Predictor[ // This handles a few items such as schema validation. // Developers only need to implement train(). 
transformSchema(dataset.schema, logging = true) - copyValues(train(dataset)) + copyValues(train(dataset).setParent(this)) } override def copy(extra: ParamMap): Learner = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index dcebea1d4b01..bc3ba7851ade 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{TreeClassifierParams, DecisionTreeParams, DecisionTreeModel, Node} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree} @@ -39,10 +39,12 @@ import org.apache.spark.sql.DataFrame * features. */ @AlphaComponent -final class DecisionTreeClassifier +final class DecisionTreeClassifier(override val uid: String) extends Predictor[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] with DecisionTreeParams with TreeClassifierParams { + def this() = this(Identifiable.randomUID("dtc")) + // Override parameter setters from parent trait for Java API compatibility. override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value) @@ -101,7 +103,7 @@ object DecisionTreeClassifier { */ @AlphaComponent final class DecisionTreeClassificationModel private[ml] ( - override val parent: DecisionTreeClassifier, + override val uid: String, override val rootNode: Node) extends PredictionModel[Vector, DecisionTreeClassificationModel] with DecisionTreeModel with Serializable { @@ -114,7 +116,7 @@ final class DecisionTreeClassificationModel private[ml] ( } override def copy(extra: ParamMap): DecisionTreeClassificationModel = { - copyValues(new DecisionTreeClassificationModel(parent, rootNode), extra) + copyValues(new DecisionTreeClassificationModel(uid, rootNode), extra) } override def toString: String = { @@ -138,6 +140,6 @@ private[ml] object DecisionTreeClassificationModel { s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). 
Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) - new DecisionTreeClassificationModel(parent, rootNode) + new DecisionTreeClassificationModel(parent.uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index ae51b05a0c42..1cecada8c21f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.regression.DecisionTreeRegressionModel import org.apache.spark.ml.tree.{GBTParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{GradientBoostedTrees => OldGBT} @@ -44,10 +44,12 @@ import org.apache.spark.sql.DataFrame * Note: Multiclass labels are not currently supported. */ @AlphaComponent -final class GBTClassifier +final class GBTClassifier(override val uid: String) extends Predictor[Vector, GBTClassifier, GBTClassificationModel] with GBTParams with TreeClassifierParams with Logging { + def this() = this(Identifiable.randomUID("gbtc")) + // Override parameter setters from parent trait for Java API compatibility. // Parameters from TreeClassifierParams: @@ -99,7 +101,7 @@ final class GBTClassifier * (default = logistic) * @group param */ - val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + + val lossType: Param[String] = new Param[String](uid, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). Supported options:" + s" ${GBTClassifier.supportedLossTypes.mkString(", ")}", (value: String) => GBTClassifier.supportedLossTypes.contains(value.toLowerCase)) @@ -160,7 +162,7 @@ object GBTClassifier { */ @AlphaComponent final class GBTClassificationModel( - override val parent: GBTClassifier, + override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], private val _treeWeights: Array[Double]) extends PredictionModel[Vector, GBTClassificationModel] @@ -184,7 +186,7 @@ final class GBTClassificationModel( } override def copy(extra: ParamMap): GBTClassificationModel = { - copyValues(new GBTClassificationModel(parent, _trees, _treeWeights), extra) + copyValues(new GBTClassificationModel(uid, _trees, _treeWeights), extra) } override def toString: String = { @@ -210,6 +212,6 @@ private[ml] object GBTClassificationModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. 
DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } - new GBTClassificationModel(parent, newTrees, oldModel.treeWeights) + new GBTClassificationModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 550369d18cfe..d1ef5de84a5d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS import org.apache.spark.mllib.linalg._ import org.apache.spark.sql.DataFrame @@ -41,10 +42,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * Currently, this class only supports binary classification. */ @AlphaComponent -class LogisticRegression +class LogisticRegression(override val uid: String) extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] with LogisticRegressionParams { + def this() = this(Identifiable.randomUID("logreg")) + /** @group setParam */ def setRegParam(value: Double): this.type = set(regParam, value) @@ -72,7 +75,7 @@ class LogisticRegression .setRegParam($(regParam)) .setNumIterations($(maxIter)) val oldModel = lr.run(oldDataset) - val lrm = new LogisticRegressionModel(this, oldModel.weights, oldModel.intercept) + val lrm = new LogisticRegressionModel(uid, oldModel.weights, oldModel.intercept) if (handlePersistence) { oldDataset.unpersist() @@ -89,7 +92,7 @@ class LogisticRegression */ @AlphaComponent class LogisticRegressionModel private[ml] ( - override val parent: LogisticRegression, + override val uid: String, val weights: Vector, val intercept: Double) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] @@ -140,7 +143,7 @@ class LogisticRegressionModel private[ml] ( } override def copy(extra: ParamMap): LogisticRegressionModel = { - copyValues(new LogisticRegressionModel(parent, weights, intercept), extra) + copyValues(new LogisticRegressionModel(uid, weights, intercept), extra) } override protected def raw2prediction(rawPrediction: Vector): Double = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 9954893f1435..825a98f6425b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -23,7 +23,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{RandomForestParams, TreeClassifierParams, DecisionTreeModel, TreeEnsembleModel} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest} @@ -41,10 +41,12 @@ import org.apache.spark.sql.DataFrame * features. 
*/ @AlphaComponent -final class RandomForestClassifier +final class RandomForestClassifier(override val uid: String) extends Predictor[Vector, RandomForestClassifier, RandomForestClassificationModel] with RandomForestParams with TreeClassifierParams { + def this() = this(Identifiable.randomUID("rfc")) + // Override parameter setters from parent trait for Java API compatibility. // Parameters from TreeClassifierParams: @@ -118,7 +120,7 @@ object RandomForestClassifier { */ @AlphaComponent final class RandomForestClassificationModel private[ml] ( - override val parent: RandomForestClassifier, + override val uid: String, private val _trees: Array[DecisionTreeClassificationModel]) extends PredictionModel[Vector, RandomForestClassificationModel] with TreeEnsembleModel with Serializable { @@ -146,7 +148,7 @@ final class RandomForestClassificationModel private[ml] ( } override def copy(extra: ParamMap): RandomForestClassificationModel = { - copyValues(new RandomForestClassificationModel(parent, _trees), extra) + copyValues(new RandomForestClassificationModel(uid, _trees), extra) } override def toString: String = { @@ -172,6 +174,6 @@ private[ml] object RandomForestClassificationModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures) } - new RandomForestClassificationModel(parent, newTrees) + new RandomForestClassificationModel(parent.uid, newTrees) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index e5a73c6087a1..b27a711fff48 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.Evaluator import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} @@ -33,13 +33,16 @@ import org.apache.spark.sql.types.DoubleType * Evaluator for binary classification, which expects two input columns: score and label. 
*/ @AlphaComponent -class BinaryClassificationEvaluator extends Evaluator with HasRawPredictionCol with HasLabelCol { +class BinaryClassificationEvaluator(override val uid: String) + extends Evaluator with HasRawPredictionCol with HasLabelCol { + + def this() = this(Identifiable.randomUID("binEval")) /** * param for metric name in evaluation * @group param */ - val metricName: Param[String] = new Param(this, "metricName", + val metricName: Param[String] = new Param(uid, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)") /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 6eb1db697111..67ad98016d86 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} @@ -32,7 +32,10 @@ import org.apache.spark.sql.types.{DoubleType, StructType} * Binarize a column of continuous features given a threshold. */ @AlphaComponent -final class Binarizer extends Transformer with HasInputCol with HasOutputCol { +final class Binarizer(override val uid: String) + extends Transformer with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("binarizer")) /** * Param for threshold used to binarize continuous features. @@ -41,7 +44,7 @@ final class Binarizer extends Transformer with HasInputCol with HasOutputCol { * @group param */ val threshold: DoubleParam = - new DoubleParam(this, "threshold", "threshold used to binarize continuous features") + new DoubleParam(uid, "threshold", "threshold used to binarize continuous features") /** @group getParam */ def getThreshold: Double = $(threshold) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index c305a819a896..664cd9413dbb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{IntParam, ParamValidators} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType @@ -29,14 +30,16 @@ import org.apache.spark.sql.types.DataType * Maps a sequence of terms to their term frequencies using the hashing trick. */ @AlphaComponent -class HashingTF extends UnaryTransformer[Iterable[_], Vector, HashingTF] { +class HashingTF(override val uid: String) extends UnaryTransformer[Iterable[_], Vector, HashingTF] { + + def this() = this(Identifiable.randomUID("hashingTF")) /** * Number of features. Should be > 0. 
* (default = 2^18^) * @group param */ - val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)", + val numFeatures = new IntParam(uid, "numFeatures", "number of features (> 0)", ParamValidators.gt(0)) setDefault(numFeatures -> (1 << 18)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala index d901a20aed00..788c392050c2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ @@ -62,7 +62,9 @@ private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol * Compute the Inverse Document Frequency (IDF) given a collection of documents. */ @AlphaComponent -final class IDF extends Estimator[IDFModel] with IDFBase { +final class IDF(override val uid: String) extends Estimator[IDFModel] with IDFBase { + + def this() = this(Identifiable.randomUID("idf")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -74,7 +76,7 @@ final class IDF extends Estimator[IDFModel] with IDFBase { transformSchema(dataset.schema, logging = true) val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v } val idf = new feature.IDF($(minDocFreq)).fit(input) - copyValues(new IDFModel(this, idf)) + copyValues(new IDFModel(uid, idf).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -88,7 +90,7 @@ final class IDF extends Estimator[IDFModel] with IDFBase { */ @AlphaComponent class IDFModel private[ml] ( - override val parent: IDF, + override val uid: String, idfModel: feature.IDFModel) extends Model[IDFModel] with IDFBase { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index 755b46a64c7f..fd0363ec06b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{DoubleParam, ParamValidators} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType @@ -29,14 +30,16 @@ import org.apache.spark.sql.types.DataType * Normalize a vector to have unit norm using the given p-norm. */ @AlphaComponent -class Normalizer extends UnaryTransformer[Vector, Vector, Normalizer] { +class Normalizer(override val uid: String) extends UnaryTransformer[Vector, Vector, Normalizer] { + + def this() = this(Identifiable.randomUID("normalizer")) /** * Normalization in L^p^ space. Must be >= 1. 
* (default: p = 2) * @group param */ - val p = new DoubleParam(this, "p", "the p norm value", ParamValidators.gtEq(1)) + val p = new DoubleParam(uid, "p", "the p norm value", ParamValidators.gtEq(1)) setDefault(p -> 2.0) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 46514ae5f0e8..1fb9b9ae7509 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -24,7 +24,7 @@ import org.apache.spark.ml.attribute.{Attribute, BinaryAttribute, NominalAttribu import org.apache.spark.mllib.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.types.{DataType, DoubleType, StructType} /** @@ -37,8 +37,10 @@ import org.apache.spark.sql.types.{DataType, DoubleType, StructType} * linearly dependent because they sum up to one. */ @AlphaComponent -class OneHotEncoder extends UnaryTransformer[Double, Vector, OneHotEncoder] - with HasInputCol with HasOutputCol { +class OneHotEncoder(override val uid: String) + extends UnaryTransformer[Double, Vector, OneHotEncoder] with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("oneHot")) /** * Whether to include a component in the encoded vectors for the first category, defaults to true. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 63e190c8aae5..e8951701c790 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -22,6 +22,7 @@ import scala.collection.mutable import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{IntParam, ParamValidators} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg._ import org.apache.spark.sql.types.DataType @@ -34,14 +35,17 @@ import org.apache.spark.sql.types.DataType * `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, x * y, y * y)`. */ @AlphaComponent -class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { +class PolynomialExpansion(override val uid: String) + extends UnaryTransformer[Vector, Vector, PolynomialExpansion] { + + def this() = this(Identifiable.randomUID("poly")) /** * The polynomial degree to expand, which should be >= 1. A value of 1 means no expansion. 
* Default: 2 * @group param */ - val degree = new IntParam(this, "degree", "the polynomial degree to expand (>= 1)", + val degree = new IntParam(uid, "degree", "the polynomial degree to expand (>= 1)", ParamValidators.gt(1)) setDefault(degree -> 2) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 7cad59ff3fa3..5ccda15d872e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -21,6 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ @@ -55,7 +56,10 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with * statistics on the samples in the training set. */ @AlphaComponent -class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerParams { +class StandardScaler(override val uid: String) extends Estimator[StandardScalerModel] + with StandardScalerParams { + + def this() = this(Identifiable.randomUID("stdScal")) setDefault(withMean -> false, withStd -> true) @@ -76,7 +80,7 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP val input = dataset.select($(inputCol)).map { case Row(v: Vector) => v } val scaler = new feature.StandardScaler(withMean = $(withMean), withStd = $(withStd)) val scalerModel = scaler.fit(input) - copyValues(new StandardScalerModel(this, scalerModel)) + copyValues(new StandardScalerModel(uid, scalerModel).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -96,7 +100,7 @@ class StandardScaler extends Estimator[StandardScalerModel] with StandardScalerP */ @AlphaComponent class StandardScalerModel private[ml] ( - override val parent: StandardScaler, + override val uid: String, scaler: feature.StandardScalerModel) extends Model[StandardScalerModel] with StandardScalerParams { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 3d78537ad84c..63055050b892 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ @@ -58,7 +59,10 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha * So the most frequent label gets index 0. 
*/ @AlphaComponent -class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase { +class StringIndexer(override val uid: String) extends Estimator[StringIndexerModel] + with StringIndexerBase { + + def this() = this(Identifiable.randomUID("strIdx")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -73,7 +77,7 @@ class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase .map(_.getString(0)) .countByValue() val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray - copyValues(new StringIndexerModel(this, labels)) + copyValues(new StringIndexerModel(uid, labels).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -87,7 +91,7 @@ class StringIndexer extends Estimator[StringIndexerModel] with StringIndexerBase */ @AlphaComponent class StringIndexerModel private[ml] ( - override val parent: StringIndexer, + override val uid: String, labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase { private val labelToIndex: OpenHashMap[String, Double] = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 2863b7621526..9fae3e7a7c87 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} /** @@ -27,7 +28,9 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} * A tokenizer that converts the input string to lowercase and then splits it by white spaces. */ @AlphaComponent -class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { +class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] { + + def this() = this(Identifiable.randomUID("tok")) override protected def createTransformFunc: String => Seq[String] = { _.toLowerCase.split("\\s") @@ -48,7 +51,10 @@ class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { * It returns an array of strings that can be empty. */ @AlphaComponent -class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenizer] { +class RegexTokenizer(override val uid: String) + extends UnaryTransformer[String, Seq[String], RegexTokenizer] { + + def this() = this(Identifiable.randomUID("regexTok")) /** * Minimum token length, >= 0. 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala index 8f2e62a8e208..f44f04dd50ec 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ @@ -33,7 +34,10 @@ import org.apache.spark.sql.types._ * A feature transformer than merge multiple columns into a vector column. */ @AlphaComponent -class VectorAssembler extends Transformer with HasInputCols with HasOutputCol { +class VectorAssembler(override val uid: String) + extends Transformer with HasInputCols with HasOutputCol { + + def this() = this(Identifiable.randomUID("va")) /** @group setParam */ def setInputCols(value: Array[String]): this.type = set(inputCols, value) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala index 07ea579d6989..25eb9843810a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, BinaryAttribute, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.{IntParam, ParamValidators, Params} import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.callUDF @@ -87,7 +87,10 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu * - Add option for allowing unknown categories. 
*/ @AlphaComponent -class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerParams { +class VectorIndexer(override val uid: String) extends Estimator[VectorIndexerModel] + with VectorIndexerParams { + + def this() = this(Identifiable.randomUID("vecIdx")) /** @group setParam */ def setMaxCategories(value: Int): this.type = set(maxCategories, value) @@ -110,7 +113,9 @@ class VectorIndexer extends Estimator[VectorIndexerModel] with VectorIndexerPara iter.foreach(localCatStats.addVector) Iterator(localCatStats) }.reduce((stats1, stats2) => stats1.merge(stats2)) - copyValues(new VectorIndexerModel(this, numFeatures, categoryStats.getCategoryMaps)) + val model = new VectorIndexerModel(uid, numFeatures, categoryStats.getCategoryMaps) + .setParent(this) + copyValues(model) } override def transformSchema(schema: StructType): StructType = { @@ -236,7 +241,7 @@ private object VectorIndexer { */ @AlphaComponent class VectorIndexerModel private[ml] ( - override val parent: VectorIndexer, + override val uid: String, val numFeatures: Int, val categoryMaps: Map[Int, Map[Double, Int]]) extends Model[VectorIndexerModel] with VectorIndexerParams { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 34ff92970129..8ace8c53bb66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.SchemaUtils +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} import org.apache.spark.mllib.linalg.BLAS._ @@ -85,7 +85,9 @@ private[feature] trait Word2VecBase extends Params * natural language processing or machine learning process. 
*/ @AlphaComponent -final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase { +final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel] with Word2VecBase { + + def this() = this(Identifiable.randomUID("w2v")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -122,7 +124,7 @@ final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase { .setSeed($(seed)) .setVectorSize($(vectorSize)) .fit(input) - copyValues(new Word2VecModel(this, wordVectors)) + copyValues(new Word2VecModel(uid, wordVectors).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -136,7 +138,7 @@ final class Word2Vec extends Estimator[Word2VecModel] with Word2VecBase { */ @AlphaComponent class Word2VecModel private[ml] ( - override val parent: Word2Vec, + override val uid: String, wordVectors: feature.Word2VecModel) extends Model[Word2VecModel] with Word2VecBase { diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index f76a54823db5..cee029579011 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -39,12 +39,17 @@ import org.apache.spark.ml.util.Identifiable * @tparam T param value type */ @AlphaComponent -class Param[T] (val parent: String, val name: String, val doc: String, val isValid: T => Boolean) +class Param[T](val parent: String, val name: String, val doc: String, val isValid: T => Boolean) extends Serializable { + def this(parent: Identifiable, name: String, doc: String, isValid: T => Boolean) = + this(parent.uid, name, doc, isValid) + def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue[T]) + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + /** * Assert that the given value is valid for this parameter. 
* @@ -173,6 +178,11 @@ class DoubleParam(parent: String, name: String, doc: String, isValid: Double => def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) + def this(parent: Identifiable, name: String, doc: String, isValid: Double => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + override def w(value: Double): ParamPair[Double] = super.w(value) } @@ -183,6 +193,11 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) + def this(parent: Identifiable, name: String, doc: String, isValid: Int => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + override def w(value: Int): ParamPair[Int] = super.w(value) } @@ -193,6 +208,11 @@ class FloatParam(parent: String, name: String, doc: String, isValid: Float => Bo def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) + def this(parent: Identifiable, name: String, doc: String, isValid: Float => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + override def w(value: Float): ParamPair[Float] = super.w(value) } @@ -203,6 +223,11 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool def this(parent: String, name: String, doc: String) = this(parent, name, doc, ParamValidators.alwaysTrue) + def this(parent: Identifiable, name: String, doc: String, isValid: Long => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + override def w(value: Long): ParamPair[Long] = super.w(value) } @@ -210,6 +235,8 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool class BooleanParam(parent: String, name: String, doc: String) // No need for isValid extends Param[Boolean](parent, name, doc) { + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + override def w(value: Boolean): ParamPair[Boolean] = super.w(value) } @@ -413,13 +440,13 @@ trait Params extends Identifiable with Serializable { } /** - * Creates a copy of this instance with a randomly generated uid and some extra params. - * The default implementation calls the default constructor to create a new instance, then - * copies the embedded and extra parameters over and returns the new instance. + * Creates a copy of this instance with the same UID and some extra params. + * The default implementation tries to create a new instance with the same UID. + * Then it copies the embedded and extra parameters over and returns the new instance. * Subclasses should override this method if the default approach is not sufficient. 
*/ def copy(extra: ParamMap): Params = { - val that = this.getClass.newInstance() + val that = this.getClass.getConstructor(classOf[String]).newInstance(uid) copyValues(that, extra) that } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 0e1ff97a8bf6..68d4c5ab2bd9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -139,7 +139,7 @@ private[shared] object SharedParamsCodeGen { | * Param for $doc. | * @group param | */ - | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) + | final val $name: $Param = new $Param(uid, "$name", "$doc"$isValid) |$setDefault | /** @group getParam */ | final def get$Name: $T = $$($name) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 87f86807c3c9..519dd44578c1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -33,7 +33,7 @@ private[ml] trait HasRegParam extends Params { * Param for regularization parameter (>= 0). * @group param */ - final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) + final val regParam: DoubleParam = new DoubleParam(uid, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getRegParam: Double = $(regParam) @@ -48,7 +48,7 @@ private[ml] trait HasMaxIter extends Params { * Param for max number of iterations (>= 0). * @group param */ - final val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0)) + final val maxIter: IntParam = new IntParam(uid, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getMaxIter: Int = $(maxIter) @@ -63,7 +63,7 @@ private[ml] trait HasFeaturesCol extends Params { * Param for features column name. * @group param */ - final val featuresCol: Param[String] = new Param[String](this, "featuresCol", "features column name") + final val featuresCol: Param[String] = new Param[String](uid, "featuresCol", "features column name") setDefault(featuresCol, "features") @@ -80,7 +80,7 @@ private[ml] trait HasLabelCol extends Params { * Param for label column name. * @group param */ - final val labelCol: Param[String] = new Param[String](this, "labelCol", "label column name") + final val labelCol: Param[String] = new Param[String](uid, "labelCol", "label column name") setDefault(labelCol, "label") @@ -97,7 +97,7 @@ private[ml] trait HasPredictionCol extends Params { * Param for prediction column name. * @group param */ - final val predictionCol: Param[String] = new Param[String](this, "predictionCol", "prediction column name") + final val predictionCol: Param[String] = new Param[String](uid, "predictionCol", "prediction column name") setDefault(predictionCol, "prediction") @@ -114,7 +114,7 @@ private[ml] trait HasRawPredictionCol extends Params { * Param for raw prediction (a.k.a. confidence) column name. * @group param */ - final val rawPredictionCol: Param[String] = new Param[String](this, "rawPredictionCol", "raw prediction (a.k.a. 
confidence) column name") + final val rawPredictionCol: Param[String] = new Param[String](uid, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name") setDefault(rawPredictionCol, "rawPrediction") @@ -131,7 +131,7 @@ private[ml] trait HasProbabilityCol extends Params { * Param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.. * @group param */ - final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") + final val probabilityCol: Param[String] = new Param[String](uid, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") setDefault(probabilityCol, "probability") @@ -148,7 +148,7 @@ private[ml] trait HasThreshold extends Params { * Param for threshold in binary classification prediction, in range [0, 1]. * @group param */ - final val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) + final val threshold: DoubleParam = new DoubleParam(uid, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getThreshold: Double = $(threshold) @@ -163,7 +163,7 @@ private[ml] trait HasInputCol extends Params { * Param for input column name. * @group param */ - final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name") + final val inputCol: Param[String] = new Param[String](uid, "inputCol", "input column name") /** @group getParam */ final def getInputCol: String = $(inputCol) @@ -178,7 +178,7 @@ private[ml] trait HasInputCols extends Params { * Param for input column names. * @group param */ - final val inputCols: Param[Array[String]] = new Param[Array[String]](this, "inputCols", "input column names") + final val inputCols: Param[Array[String]] = new Param[Array[String]](uid, "inputCols", "input column names") /** @group getParam */ final def getInputCols: Array[String] = $(inputCols) @@ -193,7 +193,7 @@ private[ml] trait HasOutputCol extends Params { * Param for output column name. * @group param */ - final val outputCol: Param[String] = new Param[String](this, "outputCol", "output column name") + final val outputCol: Param[String] = new Param[String](uid, "outputCol", "output column name") /** @group getParam */ final def getOutputCol: String = $(outputCol) @@ -208,7 +208,7 @@ private[ml] trait HasCheckpointInterval extends Params { * Param for checkpoint interval (>= 1). * @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "checkpoint interval (>= 1)", ParamValidators.gtEq(1)) + final val checkpointInterval: IntParam = new IntParam(uid, "checkpointInterval", "checkpoint interval (>= 1)", ParamValidators.gtEq(1)) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) @@ -223,7 +223,7 @@ private[ml] trait HasFitIntercept extends Params { * Param for whether to fit an intercept term. 
* @group param */ - final val fitIntercept: BooleanParam = new BooleanParam(this, "fitIntercept", "whether to fit an intercept term") + final val fitIntercept: BooleanParam = new BooleanParam(uid, "fitIntercept", "whether to fit an intercept term") setDefault(fitIntercept, true) @@ -240,7 +240,7 @@ private[ml] trait HasSeed extends Params { * Param for random seed. * @group param */ - final val seed: LongParam = new LongParam(this, "seed", "random seed") + final val seed: LongParam = new LongParam(uid, "seed", "random seed") setDefault(seed, Utils.random.nextLong()) @@ -257,7 +257,7 @@ private[ml] trait HasElasticNetParam extends Params { * Param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.. * @group param */ - final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) + final val elasticNetParam: DoubleParam = new DoubleParam(uid, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getElasticNetParam: Double = $(elasticNetParam) @@ -272,7 +272,7 @@ private[ml] trait HasTol extends Params { * Param for the convergence tolerance for iterative algorithms. * @group param */ - final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") + final val tol: DoubleParam = new DoubleParam(uid, "tol", "the convergence tolerance for iterative algorithms") /** @group getParam */ final def getTol: Double = $(tol) @@ -287,7 +287,7 @@ private[ml] trait HasStepSize extends Params { * Param for Step size to be used for each iteration of optimization.. 
* @group param */ - final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size to be used for each iteration of optimization.") + final val stepSize: DoubleParam = new DoubleParam(uid, "stepSize", "Step size to be used for each iteration of optimization.") /** @group getParam */ final def getStepSize: Double = $(stepSize) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 6cf4b4007528..0c5286ef794c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -35,6 +35,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.optimization.NNLS import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame @@ -56,7 +57,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1)) + val rank = new IntParam(uid, "rank", "rank of the factorization", ParamValidators.gtEq(1)) /** @group getParam */ def getRank: Int = $(rank) @@ -66,7 +67,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", + val numUserBlocks = new IntParam(uid, "numUserBlocks", "number of user blocks", ParamValidators.gtEq(1)) /** @group getParam */ @@ -77,7 +78,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", + val numItemBlocks = new IntParam(uid, "numItemBlocks", "number of item blocks", ParamValidators.gtEq(1)) /** @group getParam */ @@ -88,7 +89,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: false * @group param */ - val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference") + val implicitPrefs = new BooleanParam(uid, "implicitPrefs", "whether to use implicit preference") /** @group getParam */ def getImplicitPrefs: Boolean = $(implicitPrefs) @@ -98,7 +99,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 1.0 * @group param */ - val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", + val alpha = new DoubleParam(uid, "alpha", "alpha for implicit preference", ParamValidators.gtEq(0)) /** @group getParam */ @@ -109,7 +110,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "user" * @group param */ - val userCol = new Param[String](this, "userCol", "column name for user ids") + val userCol = new Param[String](uid, "userCol", "column name for user ids") /** @group getParam */ def getUserCol: String = $(userCol) @@ -119,7 +120,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "item" * @group param */ - val itemCol = new Param[String](this, "itemCol", "column name for item ids") + val itemCol = new Param[String](uid, "itemCol", "column name for item ids") /** @group getParam */ def getItemCol: String = $(itemCol) @@ -129,7 +130,7 @@ 
private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "rating" * @group param */ - val ratingCol = new Param[String](this, "ratingCol", "column name for ratings") + val ratingCol = new Param[String](uid, "ratingCol", "column name for ratings") /** @group getParam */ def getRatingCol: String = $(ratingCol) @@ -140,7 +141,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * @group param */ val nonnegative = new BooleanParam( - this, "nonnegative", "whether to use nonnegative constraint for least squares") + uid, "nonnegative", "whether to use nonnegative constraint for least squares") /** @group getParam */ def getNonnegative: Boolean = $(nonnegative) @@ -171,7 +172,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Model fitted by ALS. */ class ALSModel private[ml] ( - override val parent: ALS, + override val uid: String, k: Int, userFactors: RDD[(Int, Array[Float])], itemFactors: RDD[(Int, Array[Float])]) @@ -235,10 +236,12 @@ class ALSModel private[ml] ( * indicated user * preferences rather than explicit ratings given to items. */ -class ALS extends Estimator[ALSModel] with ALSParams { +class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams { import org.apache.spark.ml.recommendation.ALS.Rating + def this() = this(Identifiable.randomUID("als")) + /** @group setParam */ def setRank(value: Int): this.type = set(rank, value) @@ -299,7 +302,9 @@ class ALS extends Estimator[ALSModel] with ALSParams { maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs), alpha = $(alpha), nonnegative = $(nonnegative), checkpointInterval = $(checkpointInterval)) - copyValues(new ALSModel(this, $(rank), userFactors, itemFactors)) + val model = new ALSModel(uid, $(rank), userFactors, itemFactors) + .setParent(this) + copyValues(model) } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index f8f0b161a481..d71a56cf702a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{TreeRegressorParams, DecisionTreeParams, DecisionTreeModel, Node} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{DecisionTree => OldDecisionTree} @@ -38,10 +38,12 @@ import org.apache.spark.sql.DataFrame * It supports both continuous and categorical features. */ @AlphaComponent -final class DecisionTreeRegressor +final class DecisionTreeRegressor(override val uid: String) extends Predictor[Vector, DecisionTreeRegressor, DecisionTreeRegressionModel] with DecisionTreeParams with TreeRegressorParams { + def this() = this(Identifiable.randomUID("dtr")) + // Override parameter setters from parent trait for Java API compatibility. 
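
The estimator and model hunks above all follow the same recipe: the primary constructor takes the uid, a no-arg auxiliary constructor fills it in with Identifiable.randomUID, and fit builds the model with the estimator's own uid, sets the parent explicitly, and copies the param values over. A minimal sketch of that recipe, with the NoopEstimator/NoopModel names and the "noop" prefix purely illustrative (they are not part of this patch):

    import org.apache.spark.ml.{Estimator, Model}
    import org.apache.spark.ml.util.Identifiable
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.types.StructType

    // Hypothetical no-op stage illustrating the constructor and fit wiring used above.
    class NoopModel(override val uid: String) extends Model[NoopModel] {
      override def transform(dataset: DataFrame): DataFrame = dataset
      override def transformSchema(schema: StructType): StructType = schema
    }

    class NoopEstimator(override val uid: String) extends Estimator[NoopModel] {
      // No-arg constructor for user code; generates a fresh uid such as "noop_1a2b3c4d5e6f".
      def this() = this(Identifiable.randomUID("noop"))

      override def fit(dataset: DataFrame): NoopModel = {
        val model = new NoopModel(uid).setParent(this)  // model shares the estimator's uid
        copyValues(model)                               // and inherits its param values
      }

      override def transformSchema(schema: StructType): StructType = schema
    }

Constructing the model from the uid string, rather than passing the estimator itself, is what allows the default copy shown earlier to rebuild instances reflectively through a single String-argument constructor.
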
override def setMaxDepth(value: Int): this.type = super.setMaxDepth(value) @@ -91,7 +93,7 @@ object DecisionTreeRegressor { */ @AlphaComponent final class DecisionTreeRegressionModel private[ml] ( - override val parent: DecisionTreeRegressor, + override val uid: String, override val rootNode: Node) extends PredictionModel[Vector, DecisionTreeRegressionModel] with DecisionTreeModel with Serializable { @@ -104,7 +106,7 @@ final class DecisionTreeRegressionModel private[ml] ( } override def copy(extra: ParamMap): DecisionTreeRegressionModel = { - copyValues(new DecisionTreeRegressionModel(parent, rootNode), extra) + copyValues(new DecisionTreeRegressionModel(uid, rootNode), extra) } override def toString: String = { @@ -128,6 +130,6 @@ private[ml] object DecisionTreeRegressionModel { s"Cannot convert non-regression DecisionTreeModel (old API) to" + s" DecisionTreeRegressionModel (new API). Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) - new DecisionTreeRegressionModel(parent, rootNode) + new DecisionTreeRegressionModel(parent.uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 461905c12701..11717c412dfc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -24,7 +24,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.tree.{GBTParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{GradientBoostedTrees => OldGBT} @@ -42,10 +42,12 @@ import org.apache.spark.sql.DataFrame * It supports both continuous and categorical features. */ @AlphaComponent -final class GBTRegressor +final class GBTRegressor(override val uid: String) extends Predictor[Vector, GBTRegressor, GBTRegressionModel] with GBTParams with TreeRegressorParams with Logging { + def this() = this(Identifiable.randomUID("gbtr")) + // Override parameter setters from parent trait for Java API compatibility. // Parameters from TreeRegressorParams: @@ -97,7 +99,7 @@ final class GBTRegressor * (default = squared) * @group param */ - val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + + val lossType: Param[String] = new Param[String](uid, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). 
Supported options:" + s" ${GBTRegressor.supportedLossTypes.mkString(", ")}", (value: String) => GBTRegressor.supportedLossTypes.contains(value.toLowerCase)) @@ -149,7 +151,7 @@ object GBTRegressor { */ @AlphaComponent final class GBTRegressionModel( - override val parent: GBTRegressor, + override val uid: String, private val _trees: Array[DecisionTreeRegressionModel], private val _treeWeights: Array[Double]) extends PredictionModel[Vector, GBTRegressionModel] @@ -173,7 +175,7 @@ final class GBTRegressionModel( } override def copy(extra: ParamMap): GBTRegressionModel = { - copyValues(new GBTRegressionModel(parent, _trees, _treeWeights), extra) + copyValues(new GBTRegressionModel(uid, _trees, _treeWeights), extra) } override def toString: String = { @@ -199,6 +201,6 @@ private[ml] object GBTRegressionModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } - new GBTRegressionModel(parent, newTrees, oldModel.treeWeights) + new GBTRegressionModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index e63c9a3eead5..44ed86962081 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.regression +import org.apache.spark.ml.util.Identifiable + import scala.collection.mutable import breeze.linalg.{DenseVector => BDV, norm => brzNorm} @@ -59,9 +61,12 @@ private[regression] trait LinearRegressionParams extends PredictorParams * - L2 + L1 (elastic net) */ @AlphaComponent -class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] +class LinearRegression(override val uid: String) + extends Regressor[Vector, LinearRegression, LinearRegressionModel] with LinearRegressionParams with Logging { + def this() = this(Identifiable.randomUID("linReg")) + /** * Set the regularization parameter. * Default is 0.0. @@ -126,7 +131,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress logWarning(s"The standard deviation of the label is zero, so the weights will be zeros " + s"and the intercept will be the mean of the label; as a result, training is not needed.") if (handlePersistence) instances.unpersist() - return new LinearRegressionModel(this, Vectors.sparse(numFeatures, Seq()), yMean) + return new LinearRegressionModel(uid, Vectors.sparse(numFeatures, Seq()), yMean) } val featuresMean = summarizer.mean.toArray @@ -179,7 +184,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress if (handlePersistence) instances.unpersist() // TODO: Converts to sparse format based on the storage, but may base on the scoring speed. 
- new LinearRegressionModel(this, weights.compressed, intercept) + copyValues(new LinearRegressionModel(uid, weights.compressed, intercept)) } } @@ -190,7 +195,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress */ @AlphaComponent class LinearRegressionModel private[ml] ( - override val parent: LinearRegression, + override val uid: String, val weights: Vector, val intercept: Double) extends RegressionModel[Vector, LinearRegressionModel] @@ -201,7 +206,7 @@ class LinearRegressionModel private[ml] ( } override def copy(extra: ParamMap): LinearRegressionModel = { - copyValues(new LinearRegressionModel(parent, weights, intercept), extra) + copyValues(new LinearRegressionModel(uid, weights, intercept), extra) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala index dbc628927433..82437aa8de29 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/RandomForestRegressor.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{RandomForestParams, TreeRegressorParams, DecisionTreeModel, TreeEnsembleModel} -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.{RandomForest => OldRandomForest} @@ -37,10 +37,12 @@ import org.apache.spark.sql.DataFrame * It supports both continuous and categorical features. */ @AlphaComponent -final class RandomForestRegressor +final class RandomForestRegressor(override val uid: String) extends Predictor[Vector, RandomForestRegressor, RandomForestRegressionModel] with RandomForestParams with TreeRegressorParams { + def this() = this(Identifiable.randomUID("rfr")) + // Override parameter setters from parent trait for Java API compatibility. // Parameters from TreeRegressorParams: @@ -105,7 +107,7 @@ object RandomForestRegressor { */ @AlphaComponent final class RandomForestRegressionModel private[ml] ( - override val parent: RandomForestRegressor, + override val uid: String, private val _trees: Array[DecisionTreeRegressionModel]) extends PredictionModel[Vector, RandomForestRegressionModel] with TreeEnsembleModel with Serializable { @@ -128,7 +130,7 @@ final class RandomForestRegressionModel private[ml] ( } override def copy(extra: ParamMap): RandomForestRegressionModel = { - copyValues(new RandomForestRegressionModel(parent, _trees), extra) + copyValues(new RandomForestRegressionModel(uid, _trees), extra) } override def toString: String = { @@ -154,6 +156,6 @@ private[ml] object RandomForestRegressionModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. 
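
A practical consequence of keying params to the owner's uid is that two instances of the same algorithm no longer share param identities: each instance's params report that instance's uid as their parent. A small check of that behaviour, assuming only the no-arg constructors introduced in this series (the generated uids themselves are random):

    import org.apache.spark.ml.regression.LinearRegression

    val lr1 = new LinearRegression()   // uid generated by Identifiable.randomUID("linReg")
    val lr2 = new LinearRegression()

    assert(lr1.regParam.parent == lr1.uid)              // a param reports its owner's uid
    assert(lr1.regParam.parent != lr2.regParam.parent)  // so params of distinct instances differ
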
DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } - new RandomForestRegressionModel(parent, newTrees) + new RandomForestRegressionModel(parent.uid, newTrees) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 9208127eb1d7..4886cdc35a9e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -23,6 +23,7 @@ import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.util.MLUtils import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType @@ -36,7 +37,7 @@ private[ml] trait CrossValidatorParams extends Params { * param for the estimator to be cross-validated * @group param */ - val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection") + val estimator: Param[Estimator[_]] = new Param(uid, "estimator", "estimator for selection") /** @group getParam */ def getEstimator: Estimator[_] = $(estimator) @@ -46,7 +47,7 @@ private[ml] trait CrossValidatorParams extends Params { * @group param */ val estimatorParamMaps: Param[Array[ParamMap]] = - new Param(this, "estimatorParamMaps", "param maps for the estimator") + new Param(uid, "estimatorParamMaps", "param maps for the estimator") /** @group getParam */ def getEstimatorParamMaps: Array[ParamMap] = $(estimatorParamMaps) @@ -56,7 +57,7 @@ private[ml] trait CrossValidatorParams extends Params { * metric * @group param */ - val evaluator: Param[Evaluator] = new Param(this, "evaluator", + val evaluator: Param[Evaluator] = new Param(uid, "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") /** @group getParam */ @@ -67,7 +68,7 @@ private[ml] trait CrossValidatorParams extends Params { * Default: 3 * @group param */ - val numFolds: IntParam = new IntParam(this, "numFolds", + val numFolds: IntParam = new IntParam(uid, "numFolds", "number of folds for cross validation (>= 2)", ParamValidators.gtEq(2)) /** @group getParam */ @@ -81,7 +82,10 @@ private[ml] trait CrossValidatorParams extends Params { * K-fold cross validation. 
*/ @AlphaComponent -class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorParams with Logging { +class CrossValidator(override val uid: String) extends Estimator[CrossValidatorModel] + with CrossValidatorParams with Logging { + + def this() = this(Identifiable.randomUID("cv")) private val f2jBLAS = new F2jBLAS @@ -136,7 +140,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP logInfo(s"Best set of parameters:\n${epm(bestIndex)}") logInfo(s"Best cross-validation metric: $bestMetric.") val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]] - copyValues(new CrossValidatorModel(this, bestModel)) + copyValues(new CrossValidatorModel(uid, bestModel).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -150,7 +154,7 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP */ @AlphaComponent class CrossValidatorModel private[ml] ( - override val parent: CrossValidator, + override val uid: String, val bestModel: Model[_]) extends Model[CrossValidatorModel] with CrossValidatorParams { diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala index 3f00a6a04b54..146697680092 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/Identifiable.scala @@ -21,29 +21,22 @@ import java.util.UUID /** - * Object with a unique ID that identifies itself and its derivatives. + * Trait for an object with an immutable unique ID that identifies itself and its derivatives. */ -private[ml] trait Identifiable extends Serializable { +trait Identifiable { /** - * A unique ID for the object and its derivatives. The default implementation concatenates - * [[simpleClassName]], "_", and 8 random hex chars. + * An immutable unique ID for the object and its derivatives. */ - final def uid: String = _uid + val uid: String +} - /** - * A simple name of the class, which is used as the first part of the generated UID. The default - * implementation uses [[java.lang.Class#getSimpleName()]]. - */ - protected def simpleClassName: String = this.getClass.getSimpleName +object Identifiable { /** - * Sets the UID of the instance. + * Returns a random UID that concatenates the given prefix, "_", and 12 random hex chars. 
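
Concretely, a generated ID is the given prefix, an underscore, and the last twelve hex characters of a java.util.UUID string, for example:

    import org.apache.spark.ml.util.Identifiable

    val uid = Identifiable.randomUID("als")
    // e.g. "als_0123456789ab": prefix, "_", then the last 12 hex chars of a random UUID
    assert(uid.startsWith("als_"))
    assert(uid.length == "als_".length + 12)
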
*/ - protected final def setUID(uid: String): this.type = { - this._uid = uid - this + def randomUID(prefix: String): String = { + prefix + "_" + UUID.randomUUID().toString.takeRight(12) } - - private var _uid = simpleClassName + "_" + UUID.randomUUID().toString.take(8) } From fbc39f04dd44897e320cc283b0a0cfa9376f2494 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 7 May 2015 15:34:02 -0700 Subject: [PATCH 04/17] pass test:compile --- .../JavaLogisticRegressionSuite.java | 4 +-- .../apache/spark/ml/param/JavaTestParams.java | 21 ++++++++++++++- .../regression/JavaLinearRegressionSuite.java | 4 +-- .../spark/ml/util/IdentifiableSuite.scala | 26 ++++++------------- .../DecisionTreeClassifierSuite.scala | 2 +- .../classification/GBTClassifierSuite.scala | 2 +- .../LogisticRegressionSuite.scala | 4 +-- .../RandomForestClassifierSuite.scala | 2 +- .../apache/spark/ml/param/TestParams.scala | 5 +++- .../DecisionTreeRegressorSuite.scala | 2 +- .../ml/regression/GBTRegressorSuite.scala | 3 ++- .../RandomForestRegressorSuite.scala | 2 +- 12 files changed, 45 insertions(+), 32 deletions(-) diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java index 7e7189a2b1d5..f75e024a713e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaLogisticRegressionSuite.java @@ -84,7 +84,7 @@ public void logisticRegressionWithSetters() { .setThreshold(0.6) .setProbabilityCol("myProbability"); LogisticRegressionModel model = lr.fit(dataset); - LogisticRegression parent = model.parent(); + LogisticRegression parent = (LogisticRegression) model.parent(); assert(parent.getMaxIter() == 10); assert(parent.getRegParam() == 1.0); assert(parent.getThreshold() == 0.6); @@ -110,7 +110,7 @@ public void logisticRegressionWithSetters() { // Call fit() with new params, and check as many params as we can. LogisticRegressionModel model2 = lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.threshold().w(0.4), lr.probabilityCol().w("theProb")); - LogisticRegression parent2 = model2.parent(); + LogisticRegression parent2 = (LogisticRegression) model2.parent(); assert(parent2.getMaxIter() == 5); assert(parent2.getRegParam() == 0.1); assert(parent2.getThreshold() == 0.4); diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java index 8abe575610d1..fbaaefeb7680 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -21,11 +21,30 @@ import com.google.common.collect.Lists; +import org.apache.spark.ml.util.Identifiable$; + /** * A subclass of Params for testing. 
*/ public class JavaTestParams extends JavaParams { + public JavaTestParams() { + this._uid = Identifiable$.MODULE$.randomUID("javaTestParams"); + _init(); + } + + public JavaTestParams(String uid) { + this._uid = uid; + _init(); + } + + private String _uid; + + @Override + public String uid() { + return _uid; + } + public IntParam myIntParam; public int getMyIntParam() { return (Integer)getOrDefault(myIntParam); } @@ -50,7 +69,7 @@ public JavaTestParams setMyStringParam(String value) { set(myStringParam, value); return this; } - public JavaTestParams() { + private void _init() { myIntParam = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); myDoubleParam = new DoubleParam(this, "myDoubleParam", "this is a double param", ParamValidators.inRange(0.0, 1.0)); diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java index a82b86d560b6..d591a456864e 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java @@ -77,14 +77,14 @@ public void linearRegressionWithSetters() { .setMaxIter(10) .setRegParam(1.0); LinearRegressionModel model = lr.fit(dataset); - LinearRegression parent = model.parent(); + LinearRegression parent = (LinearRegression) model.parent(); assertEquals(10, parent.getMaxIter()); assertEquals(1.0, parent.getRegParam(), 0.0); // Call fit() with new params, and check as many params as we can. LinearRegressionModel model2 = lr.fit(dataset, lr.maxIter().w(5), lr.regParam().w(0.1), lr.predictionCol().w("thePred")); - LinearRegression parent2 = model2.parent(); + LinearRegression parent2 = (LinearRegression) model2.parent(); assertEquals(5, parent2.getMaxIter()); assertEquals(0.1, parent2.getRegParam(), 0.0); assertEquals("thePred", model2.getPredictionCol()); diff --git a/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala index 7c222c11b7f2..67c262d0f9d8 100644 --- a/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala +++ b/mllib/src/test/java/org/apache/spark/ml/util/IdentifiableSuite.scala @@ -21,30 +21,20 @@ import org.scalatest.FunSuite class IdentifiableSuite extends FunSuite { - import IdentifiableSuite._ + import IdentifiableSuite.Test test("Identifiable") { - val test0 = new Test0 - assert(test0.uid.startsWith(classOf[Test0].getSimpleName + "_")) - - val test1 = new Test1 - assert(test1.uid.startsWith("test_"), - "simpleClassName should be the first part of the generated UID.") - val copied = test1.copy - assert(copied.uid === test1.uid, "Copied objects should be able to use the same UID.") + val test0 = new Test("test_0") + assert(test0.uid === "test_0") + + val test1 = new Test + assert(test1.uid.startsWith("test_")) } } object IdentifiableSuite { - class Test0 extends Identifiable - - class Test1 extends Identifiable { - - override def simpleClassName: String = "test" - - def copy: Test1 = { - new Test1().setUID(uid) - } + class Test(override val uid: String) extends Identifiable { + def this() = this(Identifiable.randomUID("test")) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 03af4ecd7a7e..3fdc66be8a31 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -268,7 +268,7 @@ private[ml] object DecisionTreeClassifierSuite extends FunSuite { val newTree = dt.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. val oldTreeAsNew = DecisionTreeClassificationModel.fromOld( - oldTree, newTree.parent, categoricalFeatures) + oldTree, newTree.parent.asInstanceOf[DecisionTreeClassifier], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index 16c758b82c7c..ea86867f1161 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -130,7 +130,7 @@ private object GBTClassifierSuite { val newModel = gbt.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. val oldModelAsNew = GBTClassificationModel.fromOld( - oldModel, newModel.parent, categoricalFeatures) + oldModel, newModel.parent.asInstanceOf[GBTClassifier], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 6dd1fdf05514..b44ca9120f4a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -74,7 +74,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { .setThreshold(0.6) .setProbabilityCol("myProbability") val model = lr.fit(dataset) - val parent = model.parent + val parent = model.parent.asInstanceOf[LogisticRegression] assert(parent.getMaxIter === 10) assert(parent.getRegParam === 1.0) assert(parent.getThreshold === 0.6) @@ -100,7 +100,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { // Call fit() with new params, and check as many params as we can. val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.threshold -> 0.4, lr.probabilityCol -> "theProb") - val parent2 = model2.parent + val parent2 = model2.parent.asInstanceOf[LogisticRegression] assert(parent2.getMaxIter === 5) assert(parent2.getRegParam === 0.1) assert(parent2.getThreshold === 0.4) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index c41def933050..08f86fa45bc1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -160,7 +160,7 @@ private object RandomForestClassifierSuite { val newModel = rf.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. 
val oldModelAsNew = RandomForestClassificationModel.fromOld( - oldModel, newModel.parent, categoricalFeatures) + oldModel, newModel.parent.asInstanceOf[RandomForestClassifier], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala index dc1607364040..a9e78366ad98 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/TestParams.scala @@ -18,9 +18,12 @@ package org.apache.spark.ml.param import org.apache.spark.ml.param.shared.{HasInputCol, HasMaxIter} +import org.apache.spark.ml.util.Identifiable /** A subclass of Params for testing. */ -class TestParams extends Params with HasMaxIter with HasInputCol { +class TestParams(override val uid: String) extends Params with HasMaxIter with HasInputCol { + + def this() = this(Identifiable.randomUID("testParams")) def setMaxIter(value: Int): this.type = { set(maxIter, value); this } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 5aa81b44ddaf..1196a772dfdd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -85,7 +85,7 @@ private[ml] object DecisionTreeRegressorSuite extends FunSuite { val newTree = dt.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. val oldTreeAsNew = DecisionTreeRegressionModel.fromOld( - oldTree, newTree.parent, categoricalFeatures) + oldTree, newTree.parent.asInstanceOf[DecisionTreeRegressor], categoricalFeatures) TreeTests.checkEqual(oldTreeAsNew, newTree) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index 25b36ab08b67..40e7e3273e96 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -130,7 +130,8 @@ private object GBTRegressorSuite { val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0) val newModel = gbt.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. - val oldModelAsNew = GBTRegressionModel.fromOld(oldModel, newModel.parent, categoricalFeatures) + val oldModelAsNew = GBTRegressionModel.fromOld( + oldModel, newModel.parent.asInstanceOf[GBTRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala index 45f09f4fdab8..3efffbb763b7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala @@ -116,7 +116,7 @@ private object RandomForestRegressorSuite extends FunSuite { val newModel = rf.fit(newData) // Use parent, fittingParamMap from newTree since these are not checked anyways. 
val oldModelAsNew = RandomForestRegressionModel.fromOld( - oldModel, newModel.parent, categoricalFeatures) + oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures) TreeTests.checkEqual(oldModelAsNew, newModel) } } From e1160cfceb249db8071181620871a25f7a910a91 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 7 May 2015 15:45:11 -0700 Subject: [PATCH 05/17] fix tests --- .../spark/ml/classification/DecisionTreeClassifier.scala | 1 + .../org/apache/spark/ml/classification/GBTClassifier.scala | 1 + .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 3 ++- .../scala/org/apache/spark/ml/regression/GBTRegressor.scala | 1 + .../src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala | 2 +- 5 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index bc3ba7851ade..d34ffd3990c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -140,6 +140,7 @@ private[ml] object DecisionTreeClassificationModel { s"Cannot convert non-classification DecisionTreeModel (old API) to" + s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) + val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc") new DecisionTreeClassificationModel(parent.uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 1cecada8c21f..441f1bddc919 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -212,6 +212,7 @@ private[ml] object GBTClassificationModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } + val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtc") new GBTClassificationModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index d71a56cf702a..e67df21b2e4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -130,6 +130,7 @@ private[ml] object DecisionTreeRegressionModel { s"Cannot convert non-regression DecisionTreeModel (old API) to" + s" DecisionTreeRegressionModel (new API). 
Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) - new DecisionTreeRegressionModel(parent.uid, rootNode) + val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtr") + new DecisionTreeRegressionModel(uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 11717c412dfc..050ae0f6bc10 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -201,6 +201,7 @@ private[ml] object GBTRegressionModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. DecisionTreeRegressionModel.fromOld(tree, null, categoricalFeatures) } + val uid = if (parent != null) parent.uid else Identifiable.randomUID("gbtr") new GBTRegressionModel(parent.uid, newTrees, oldModel.treeWeights) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index 6056e7d3f6ff..e30eab9e005f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -27,7 +27,7 @@ class ParamsSuite extends FunSuite { assert(maxIter.name === "maxIter") assert(maxIter.doc === "max number of iterations (>= 0)") - assert(maxIter.parent.eq(solver)) + assert(maxIter.parent === solver.uid) assert(maxIter.toString === "maxIter: max number of iterations (>= 0) (default: 10)") assert(!maxIter.isValid(-1)) assert(maxIter.isValid(0)) From c255f17ee3e9e26f751973b6113dc91cfc94defd Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 8 May 2015 14:34:37 -0700 Subject: [PATCH 06/17] fix tests in ParamsSuite --- .../spark/ml/feature/ElementwiseProduct.scala | 6 +++++- .../scala/org/apache/spark/ml/param/params.scala | 2 +- .../org/apache/spark/ml/param/ParamsSuite.scala | 16 ++++++++++------ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala index f8b56293e3cc..8b32eee0e490 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala @@ -20,6 +20,7 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.Param +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType @@ -31,7 +32,10 @@ import org.apache.spark.sql.types.DataType * multiplier. 
*/ @AlphaComponent -class ElementwiseProduct extends UnaryTransformer[Vector, Vector, ElementwiseProduct] { +class ElementwiseProduct(override val uid: String) + extends UnaryTransformer[Vector, Vector, ElementwiseProduct] { + + def this() = this(Identifiable.randomUID("elemProd")) /** * the vector to multiply with input vectors diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index d3aca30a35e9..689a82779119 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -79,7 +79,7 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali */ def ->(value: T): ParamPair[T] = ParamPair(this, value) - override final def toString: String = "${parent}__$name" + override final def toString: String = s"${parent}__$name" override final def hashCode: Int = toString.## diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index e30eab9e005f..b96874f3a882 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -23,21 +23,22 @@ class ParamsSuite extends FunSuite { test("param") { val solver = new TestParams() + val uid = solver.uid import solver.{maxIter, inputCol} assert(maxIter.name === "maxIter") assert(maxIter.doc === "max number of iterations (>= 0)") - assert(maxIter.parent === solver.uid) - assert(maxIter.toString === "maxIter: max number of iterations (>= 0) (default: 10)") + assert(maxIter.parent === uid) + assert(maxIter.toString === s"${uid}__maxIter") assert(!maxIter.isValid(-1)) assert(maxIter.isValid(0)) assert(maxIter.isValid(1)) solver.setMaxIter(5) - assert(maxIter.toString === + assert(solver.explainParam(maxIter) === "maxIter: max number of iterations (>= 0) (default: 10, current: 5)") - assert(inputCol.toString === "inputCol: input column name (undefined)") + assert(inputCol.toString === s"${uid}__inputCol") intercept[IllegalArgumentException] { solver.setMaxIter(-1) @@ -118,7 +119,10 @@ class ParamsSuite extends FunSuite { assert(!solver.isDefined(inputCol)) intercept[NoSuchElementException](solver.getInputCol) - assert(solver.explainParams() === Seq(inputCol, maxIter).mkString("\n")) + assert(solver.explainParam(maxIter) === + "maxIter: max number of iterations (>= 0) (default: 10, current: 100)") + assert(solver.explainParams() === + Seq(inputCol, maxIter).map(solver.explainParam).mkString("\n")) assert(solver.getParam("inputCol").eq(inputCol)) assert(solver.getParam("maxIter").eq(maxIter)) @@ -148,7 +152,7 @@ class ParamsSuite extends FunSuite { assert(!solver.isSet(maxIter)) val copied = solver.copy(ParamMap(solver.maxIter -> 50)) - assert(copied.uid !== solver.uid) + assert(copied.uid === solver.uid) assert(copied.getInputCol === solver.getInputCol) assert(copied.getMaxIter === 50) } From fdbc415bb9e2306df37c215d587ac57f8418b791 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 8 May 2015 14:59:56 -0700 Subject: [PATCH 07/17] all tests passed --- .../spark/ml/classification/DecisionTreeClassifier.scala | 2 +- .../spark/ml/classification/RandomForestClassifier.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index d34ffd3990c9..7c961332bf5b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -141,6 +141,6 @@ private[ml] object DecisionTreeClassificationModel { s" DecisionTreeClassificationModel (new API). Algo is: ${oldModel.algo}") val rootNode = Node.fromOld(oldModel.topNode, categoricalFeatures) val uid = if (parent != null) parent.uid else Identifiable.randomUID("dtc") - new DecisionTreeClassificationModel(parent.uid, rootNode) + new DecisionTreeClassificationModel(uid, rootNode) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 825a98f6425b..a1de7919859e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -174,6 +174,7 @@ private[ml] object RandomForestClassificationModel { // parent, fittingParamMap for each tree is null since there are no good ways to set these. DecisionTreeClassificationModel.fromOld(tree, null, categoricalFeatures) } - new RandomForestClassificationModel(parent.uid, newTrees) + val uid = if (parent != null) parent.uid else Identifiable.randomUID("rfc") + new RandomForestClassificationModel(uid, newTrees) } } From a4794dd842f82b001daef73dd82016766b6215b9 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 8 May 2015 15:13:03 -0700 Subject: [PATCH 08/17] change Param to use to reduce the size of diff --- .../scala/org/apache/spark/ml/Pipeline.scala | 2 +- .../ml/classification/GBTClassifier.scala | 2 +- .../BinaryClassificationEvaluator.scala | 2 +- .../apache/spark/ml/feature/Binarizer.scala | 2 +- .../apache/spark/ml/feature/HashingTF.scala | 2 +- .../apache/spark/ml/feature/Normalizer.scala | 2 +- .../ml/feature/PolynomialExpansion.scala | 2 +- .../ml/param/shared/SharedParamsCodeGen.scala | 2 +- .../spark/ml/param/shared/sharedParams.scala | 32 +++++++++---------- .../apache/spark/ml/recommendation/ALS.scala | 18 +++++------ .../spark/ml/regression/GBTRegressor.scala | 2 +- .../ml/regression/LinearRegression.scala | 6 ++-- .../spark/ml/tuning/CrossValidator.scala | 8 ++--- 13 files changed, 40 insertions(+), 42 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index fc5a76b01949..f3368ad2d364 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -89,7 +89,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] { * param for pipeline stages * @group param */ - val stages: Param[Array[PipelineStage]] = new Param(uid, "stages", "stages of the pipeline") + val stages: Param[Array[PipelineStage]] = new Param(this, "stages", "stages of the pipeline") /** @group setParam */ def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala index 441f1bddc919..d504d84beb91 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala @@ -101,7 +101,7 @@ final class GBTClassifier(override val uid: String) * (default = logistic) * @group param */ - val lossType: Param[String] = new Param[String](uid, "lossType", "Loss function which GBT" + + val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). Supported options:" + s" ${GBTClassifier.supportedLossTypes.mkString(", ")}", (value: String) => GBTClassifier.supportedLossTypes.contains(value.toLowerCase)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index b27a711fff48..c1af09c9694b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -42,7 +42,7 @@ class BinaryClassificationEvaluator(override val uid: String) * param for metric name in evaluation * @group param */ - val metricName: Param[String] = new Param(uid, "metricName", + val metricName: Param[String] = new Param(this, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)") /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 67ad98016d86..62f4a6343423 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -44,7 +44,7 @@ final class Binarizer(override val uid: String) * @group param */ val threshold: DoubleParam = - new DoubleParam(uid, "threshold", "threshold used to binarize continuous features") + new DoubleParam(this, "threshold", "threshold used to binarize continuous features") /** @group getParam */ def getThreshold: Double = $(threshold) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala index 664cd9413dbb..30033ced68a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala @@ -39,7 +39,7 @@ class HashingTF(override val uid: String) extends UnaryTransformer[Iterable[_], * (default = 2^18^) * @group param */ - val numFeatures = new IntParam(uid, "numFeatures", "number of features (> 0)", + val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)", ParamValidators.gt(0)) setDefault(numFeatures -> (1 << 18)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala index fd0363ec06b0..3f689d1585cd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Normalizer.scala @@ -39,7 +39,7 @@ class Normalizer(override val uid: String) extends UnaryTransformer[Vector, Vect * (default: p = 2) * @group param */ - val p = new DoubleParam(uid, "p", "the p norm value", ParamValidators.gtEq(1)) + val p = new DoubleParam(this, "p", "the p norm value", ParamValidators.gtEq(1)) setDefault(p -> 2.0) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 4d691a78cda4..41564410e496 
100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -45,7 +45,7 @@ class PolynomialExpansion(override val uid: String) * Default: 2 * @group param */ - val degree = new IntParam(uid, "degree", "the polynomial degree to expand (>= 1)", + val degree = new IntParam(this, "degree", "the polynomial degree to expand (>= 1)", ParamValidators.gt(1)) setDefault(degree -> 2) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index d1147336e3ed..5085b798daa1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -140,7 +140,7 @@ private[shared] object SharedParamsCodeGen { | * Param for $doc. | * @group param | */ - | final val $name: $Param = new $Param(uid, "$name", "$doc"$isValid) + | final val $name: $Param = new $Param(this, "$name", "$doc"$isValid) |$setDefault | /** @group getParam */ | final def get$Name: $T = $$($name) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 5758258fc3ba..7525d3700737 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -33,7 +33,7 @@ private[ml] trait HasRegParam extends Params { * Param for regularization parameter (>= 0). * @group param */ - final val regParam: DoubleParam = new DoubleParam(uid, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) + final val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getRegParam: Double = $(regParam) @@ -48,7 +48,7 @@ private[ml] trait HasMaxIter extends Params { * Param for max number of iterations (>= 0). * @group param */ - final val maxIter: IntParam = new IntParam(uid, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0)) + final val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations (>= 0)", ParamValidators.gtEq(0)) /** @group getParam */ final def getMaxIter: Int = $(maxIter) @@ -63,7 +63,7 @@ private[ml] trait HasFeaturesCol extends Params { * Param for features column name. * @group param */ - final val featuresCol: Param[String] = new Param[String](uid, "featuresCol", "features column name") + final val featuresCol: Param[String] = new Param[String](this, "featuresCol", "features column name") setDefault(featuresCol, "features") @@ -80,7 +80,7 @@ private[ml] trait HasLabelCol extends Params { * Param for label column name. * @group param */ - final val labelCol: Param[String] = new Param[String](uid, "labelCol", "label column name") + final val labelCol: Param[String] = new Param[String](this, "labelCol", "label column name") setDefault(labelCol, "label") @@ -97,7 +97,7 @@ private[ml] trait HasPredictionCol extends Params { * Param for prediction column name. 
* @group param */ - final val predictionCol: Param[String] = new Param[String](uid, "predictionCol", "prediction column name") + final val predictionCol: Param[String] = new Param[String](this, "predictionCol", "prediction column name") setDefault(predictionCol, "prediction") @@ -114,7 +114,7 @@ private[ml] trait HasRawPredictionCol extends Params { * Param for raw prediction (a.k.a. confidence) column name. * @group param */ - final val rawPredictionCol: Param[String] = new Param[String](uid, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name") + final val rawPredictionCol: Param[String] = new Param[String](this, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name") setDefault(rawPredictionCol, "rawPrediction") @@ -131,7 +131,7 @@ private[ml] trait HasProbabilityCol extends Params { * Param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.. * @group param */ - final val probabilityCol: Param[String] = new Param[String](uid, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") + final val probabilityCol: Param[String] = new Param[String](this, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.") setDefault(probabilityCol, "probability") @@ -148,7 +148,7 @@ private[ml] trait HasThreshold extends Params { * Param for threshold in binary classification prediction, in range [0, 1]. * @group param */ - final val threshold: DoubleParam = new DoubleParam(uid, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) + final val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in binary classification prediction, in range [0, 1]", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getThreshold: Double = $(threshold) @@ -163,7 +163,7 @@ private[ml] trait HasInputCol extends Params { * Param for input column name. * @group param */ - final val inputCol: Param[String] = new Param[String](uid, "inputCol", "input column name") + final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name") /** @group getParam */ final def getInputCol: String = $(inputCol) @@ -193,7 +193,7 @@ private[ml] trait HasOutputCol extends Params { * Param for output column name. * @group param */ - final val outputCol: Param[String] = new Param[String](uid, "outputCol", "output column name") + final val outputCol: Param[String] = new Param[String](this, "outputCol", "output column name") /** @group getParam */ final def getOutputCol: String = $(outputCol) @@ -208,7 +208,7 @@ private[ml] trait HasCheckpointInterval extends Params { * Param for checkpoint interval (>= 1). 
* @group param */ - final val checkpointInterval: IntParam = new IntParam(uid, "checkpointInterval", "checkpoint interval (>= 1)", ParamValidators.gtEq(1)) + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "checkpoint interval (>= 1)", ParamValidators.gtEq(1)) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) @@ -223,7 +223,7 @@ private[ml] trait HasFitIntercept extends Params { * Param for whether to fit an intercept term. * @group param */ - final val fitIntercept: BooleanParam = new BooleanParam(uid, "fitIntercept", "whether to fit an intercept term") + final val fitIntercept: BooleanParam = new BooleanParam(this, "fitIntercept", "whether to fit an intercept term") setDefault(fitIntercept, true) @@ -240,7 +240,7 @@ private[ml] trait HasSeed extends Params { * Param for random seed. * @group param */ - final val seed: LongParam = new LongParam(uid, "seed", "random seed") + final val seed: LongParam = new LongParam(this, "seed", "random seed") setDefault(seed, Utils.random.nextLong()) @@ -257,7 +257,7 @@ private[ml] trait HasElasticNetParam extends Params { * Param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.. * @group param */ - final val elasticNetParam: DoubleParam = new DoubleParam(uid, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) + final val elasticNetParam: DoubleParam = new DoubleParam(this, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) /** @group getParam */ final def getElasticNetParam: Double = $(elasticNetParam) @@ -272,7 +272,7 @@ private[ml] trait HasTol extends Params { * Param for the convergence tolerance for iterative algorithms. * @group param */ - final val tol: DoubleParam = new DoubleParam(uid, "tol", "the convergence tolerance for iterative algorithms") + final val tol: DoubleParam = new DoubleParam(this, "tol", "the convergence tolerance for iterative algorithms") /** @group getParam */ final def getTol: Double = $(tol) @@ -287,7 +287,7 @@ private[ml] trait HasStepSize extends Params { * Param for Step size to be used for each iteration of optimization.. 
* @group param */ - final val stepSize: DoubleParam = new DoubleParam(uid, "stepSize", "Step size to be used for each iteration of optimization.") + final val stepSize: DoubleParam = new DoubleParam(this, "stepSize", "Step size to be used for each iteration of optimization.") /** @group getParam */ final def getStepSize: Double = $(stepSize) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index 0c5286ef794c..b3658d181a48 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -57,7 +57,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val rank = new IntParam(uid, "rank", "rank of the factorization", ParamValidators.gtEq(1)) + val rank = new IntParam(this, "rank", "rank of the factorization", ParamValidators.gtEq(1)) /** @group getParam */ def getRank: Int = $(rank) @@ -67,7 +67,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val numUserBlocks = new IntParam(uid, "numUserBlocks", "number of user blocks", + val numUserBlocks = new IntParam(this, "numUserBlocks", "number of user blocks", ParamValidators.gtEq(1)) /** @group getParam */ @@ -78,7 +78,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 10 * @group param */ - val numItemBlocks = new IntParam(uid, "numItemBlocks", "number of item blocks", + val numItemBlocks = new IntParam(this, "numItemBlocks", "number of item blocks", ParamValidators.gtEq(1)) /** @group getParam */ @@ -89,7 +89,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: false * @group param */ - val implicitPrefs = new BooleanParam(uid, "implicitPrefs", "whether to use implicit preference") + val implicitPrefs = new BooleanParam(this, "implicitPrefs", "whether to use implicit preference") /** @group getParam */ def getImplicitPrefs: Boolean = $(implicitPrefs) @@ -99,7 +99,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: 1.0 * @group param */ - val alpha = new DoubleParam(uid, "alpha", "alpha for implicit preference", + val alpha = new DoubleParam(this, "alpha", "alpha for implicit preference", ParamValidators.gtEq(0)) /** @group getParam */ @@ -110,7 +110,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "user" * @group param */ - val userCol = new Param[String](uid, "userCol", "column name for user ids") + val userCol = new Param[String](this, "userCol", "column name for user ids") /** @group getParam */ def getUserCol: String = $(userCol) @@ -120,7 +120,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "item" * @group param */ - val itemCol = new Param[String](uid, "itemCol", "column name for item ids") + val itemCol = new Param[String](this, "itemCol", "column name for item ids") /** @group getParam */ def getItemCol: String = $(itemCol) @@ -130,7 +130,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * Default: "rating" * @group param */ - val ratingCol = new Param[String](uid, "ratingCol", "column name for ratings") + val ratingCol = new Param[String](this, "ratingCol", "column name for ratings") /** @group getParam */ def getRatingCol: String = $(ratingCol) @@ -141,7 
+141,7 @@ private[recommendation] trait ALSParams extends Params with HasMaxIter with HasR * @group param */ val nonnegative = new BooleanParam( - uid, "nonnegative", "whether to use nonnegative constraint for least squares") + this, "nonnegative", "whether to use nonnegative constraint for least squares") /** @group getParam */ def getNonnegative: Boolean = $(nonnegative) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala index 050ae0f6bc10..4249ff5c1ebc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GBTRegressor.scala @@ -99,7 +99,7 @@ final class GBTRegressor(override val uid: String) * (default = squared) * @group param */ - val lossType: Param[String] = new Param[String](uid, "lossType", "Loss function which GBT" + + val lossType: Param[String] = new Param[String](this, "lossType", "Loss function which GBT" + " tries to minimize (case-insensitive). Supported options:" + s" ${GBTRegressor.supportedLossTypes.mkString(", ")}", (value: String) => GBTRegressor.supportedLossTypes.contains(value.toLowerCase)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 44ed86962081..4a5312cfc446 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -17,19 +17,17 @@ package org.apache.spark.ml.regression -import org.apache.spark.ml.util.Identifiable - import scala.collection.mutable import breeze.linalg.{DenseVector => BDV, norm => brzNorm} -import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, - OWLQN => BreezeOWLQN} +import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import org.apache.spark.Logging import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasElasticNetParam, HasMaxIter, HasRegParam, HasTol} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.mllib.regression.LabeledPoint diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index e8349fa4864e..5c6ff2dda360 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -37,7 +37,7 @@ private[ml] trait CrossValidatorParams extends Params { * param for the estimator to be cross-validated * @group param */ - val estimator: Param[Estimator[_]] = new Param(uid, "estimator", "estimator for selection") + val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection") /** @group getParam */ def getEstimator: Estimator[_] = $(estimator) @@ -47,7 +47,7 @@ private[ml] trait CrossValidatorParams extends Params { * @group param */ val estimatorParamMaps: Param[Array[ParamMap]] = - new Param(uid, "estimatorParamMaps", "param maps for the estimator") + new Param(this, "estimatorParamMaps", "param maps for the estimator") /** @group getParam */ def getEstimatorParamMaps: Array[ParamMap] = 
$(estimatorParamMaps) @@ -57,7 +57,7 @@ private[ml] trait CrossValidatorParams extends Params { * metric * @group param */ - val evaluator: Param[Evaluator] = new Param(uid, "evaluator", + val evaluator: Param[Evaluator] = new Param(this, "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") /** @group getParam */ @@ -68,7 +68,7 @@ private[ml] trait CrossValidatorParams extends Params { * Default: 3 * @group param */ - val numFolds: IntParam = new IntParam(uid, "numFolds", + val numFolds: IntParam = new IntParam(this, "numFolds", "number of folds for cross validation (>= 2)", ParamValidators.gtEq(2)) /** @group getParam */ From aa4a6118e14a915fc2d11fa8fc412b90e372b921 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 8 May 2015 15:53:37 -0700 Subject: [PATCH 09/17] fix examples/compile --- .../examples/ml/JavaDeveloperApiExample.java | 33 ++++++++++++++++--- .../examples/ml/DeveloperApiExample.scala | 11 ++++--- .../org/apache/spark/ml/param/params.scala | 2 +- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index eac4f898a475..9a822608be4d 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -28,6 +28,7 @@ import org.apache.spark.ml.classification.ClassificationModel; import org.apache.spark.ml.param.IntParam; import org.apache.spark.ml.param.ParamMap; +import org.apache.spark.ml.util.Identifiable$; import org.apache.spark.mllib.linalg.BLAS; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; @@ -105,6 +106,22 @@ public static void main(String[] args) throws Exception { class MyJavaLogisticRegression extends Classifier { + public MyJavaLogisticRegression() { + _init(); + } + + public MyJavaLogisticRegression(String uid) { + this._uid = uid; + _init(); + } + + private String _uid = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + + @Override + public String uid() { + return _uid; + } + /** * Param for max number of iterations *
@@ -117,7 +134,7 @@ class MyJavaLogisticRegression int getMaxIter() { return (Integer) getOrDefault(maxIter); } - public MyJavaLogisticRegression() { + private void _init() { setMaxIter(100); } @@ -137,7 +154,7 @@ public MyJavaLogisticRegressionModel train(DataFrame dataset) { Vector weights = Vectors.zeros(numFeatures); // Learning would happen here. // Create a model, and return it. - return new MyJavaLogisticRegressionModel(this, weights); + return new MyJavaLogisticRegressionModel(uid(), weights).setParent(this); } } @@ -157,11 +174,19 @@ class MyJavaLogisticRegressionModel private Vector weights_; public Vector weights() { return weights_; } - public MyJavaLogisticRegressionModel(MyJavaLogisticRegression parent_, Vector weights_) { + public MyJavaLogisticRegressionModel(String uid, Vector weights_) { + this._uid = uid; this.parent_ = parent_; this.weights_ = weights_; } + private String _uid = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + + @Override + public String uid() { + return _uid; + } + // This uses the default implementation of transform(), which reads column "features" and outputs // columns "prediction" and "rawPrediction." @@ -204,6 +229,6 @@ public Vector predictRaw(Vector features) { */ @Override public MyJavaLogisticRegressionModel copy(ParamMap extra) { - return copyValues(new MyJavaLogisticRegressionModel(parent_, weights_), extra); + return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra); } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 2a2d0677272a..f8f9dd915b9d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -17,6 +17,7 @@ package org.apache.spark.examples.ml +import org.apache.spark.ml.util.Identifiable import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ClassifierParams} import org.apache.spark.ml.param.{IntParam, ParamMap} @@ -106,10 +107,12 @@ private trait MyLogisticRegressionParams extends ClassifierParams { * * NOTE: This is private since it is an example. In practice, you may not want it to be private. */ -private class MyLogisticRegression +private class MyLogisticRegression(override val uid: String) extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel] with MyLogisticRegressionParams { + def this() = this(Identifiable.randomUID("myLogReg")) + setMaxIter(100) // Initialize // The parameter setter is in this class since it should return type MyLogisticRegression. @@ -125,7 +128,7 @@ private class MyLogisticRegression val weights = Vectors.zeros(numFeatures) // Learning would happen here. // Create a model, and return it. - new MyLogisticRegressionModel(this, weights) + new MyLogisticRegressionModel(uid, weights).setParent(this) } } @@ -135,7 +138,7 @@ private class MyLogisticRegression * NOTE: This is private since it is an example. In practice, you may not want it to be private. */ private class MyLogisticRegressionModel( - override val parent: MyLogisticRegression, + override val uid: String, val weights: Vector) extends ClassificationModel[Vector, MyLogisticRegressionModel] with MyLogisticRegressionParams { @@ -173,6 +176,6 @@ private class MyLogisticRegressionModel( * This is used for the default implementation of [[transform()]]. 
*/ override def copy(extra: ParamMap): MyLogisticRegressionModel = { - copyValues(new MyLogisticRegressionModel(parent, weights), extra) + copyValues(new MyLogisticRegressionModel(uid, weights), extra) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 689a82779119..d07f547fd221 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -434,7 +434,7 @@ trait Params extends Identifiable with Serializable { * respectively. Make sure that the params are initialized before this method * gets called. */ - @varargs + // @varargs protected final def setDefault(paramPairs: ParamPair[_]*): this.type = { paramPairs.foreach { p => setDefault(p.param.asInstanceOf[Param[Any]], p.value) From 629d402569f33cb8ccf703ee0b2779bc975193b8 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sat, 9 May 2015 10:28:10 -0700 Subject: [PATCH 10/17] fix LRSuite --- .../classification/LogisticRegression.scala | 2 +- .../LogisticRegressionSuite.scala | 35 +++++++++---------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index edd2252eda21..e607c24a7c61 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -216,7 +216,7 @@ class LogisticRegression(override val uid: String) (weightsWithIntercept, 0.0) } - new LogisticRegressionModel(this, weights.compressed, intercept) + new LogisticRegressionModel(uid, weights.compressed, intercept) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index b03dd0991f02..9119745eb6f6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -19,13 +19,12 @@ package org.apache.spark.ml.classification import org.scalatest.FunSuite -import org.apache.spark.mllib.classification.LogisticRegressionSuite +import org.apache.spark.mllib.classification.LogisticRegressionSuite._ import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} - class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { @transient var sqlContext: SQLContext = _ @@ -37,8 +36,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { super.beforeAll() sqlContext = new SQLContext(sc) - dataset = sqlContext.createDataFrame(sc.parallelize(LogisticRegressionSuite - .generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42), 4)) + dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)) /** * Here is the instruction describing how to export the test data into CSV format @@ -60,31 +58,30 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) - val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( - weights, xMean, 
xVariance, true, nPoints, 42) + val testData = generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42) - sqlContext.createDataFrame(sc.parallelize(LogisticRegressionSuite - .generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42), 4)) + sqlContext.createDataFrame( + generateMultinomialLogisticInput(weights, xMean, xVariance, true, nPoints, 42)) } } test("logistic regression: default params") { val lr = new LogisticRegression - assert(lr.getLabelCol == "label") - assert(lr.getFeaturesCol == "features") - assert(lr.getPredictionCol == "prediction") - assert(lr.getRawPredictionCol == "rawPrediction") - assert(lr.getProbabilityCol == "probability") - assert(lr.getFitIntercept == true) + assert(lr.getLabelCol === "label") + assert(lr.getFeaturesCol === "features") + assert(lr.getPredictionCol === "prediction") + assert(lr.getRawPredictionCol === "rawPrediction") + assert(lr.getProbabilityCol === "probability") + assert(lr.getFitIntercept) val model = lr.fit(dataset) model.transform(dataset) .select("label", "probability", "prediction", "rawPrediction") .collect() assert(model.getThreshold === 0.5) - assert(model.getFeaturesCol == "features") - assert(model.getPredictionCol == "prediction") - assert(model.getRawPredictionCol == "rawPrediction") - assert(model.getProbabilityCol == "probability") + assert(model.getFeaturesCol === "features") + assert(model.getPredictionCol === "prediction") + assert(model.getRawPredictionCol === "rawPrediction") + assert(model.getProbabilityCol === "probability") assert(model.intercept !== 0.0) } @@ -134,7 +131,7 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext { assert(parent2.getRegParam === 0.1) assert(parent2.getThreshold === 0.4) assert(model2.getThreshold === 0.4) - assert(model2.getProbabilityCol == "theProb") + assert(model2.getProbabilityCol === "theProb") } test("logistic regression: Predictor, Classifier methods") { From 697fdf9841b73c838a292210b34c179313bc30f1 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 11 May 2015 23:59:12 -0700 Subject: [PATCH 11/17] update Bucketizer --- .../scala/org/apache/spark/ml/feature/Bucketizer.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 7dba64bc3506..ae8af1829338 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -18,11 +18,11 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.AlphaComponent +import org.apache.spark.ml.Model import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.util.SchemaUtils -import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @@ -32,10 +32,10 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} * `Bucketizer` maps a column of continuous features to a column of feature buckets. 
*/ @AlphaComponent -final class Bucketizer private[ml] (override val parent: Estimator[Bucketizer]) +final class Bucketizer(override val uid: String) extends Model[Bucketizer] with HasInputCol with HasOutputCol { - def this() = this(null) + def this() = this(Identifiable.randomUID("bucketizer")) /** * Parameter for mapping continuous features into buckets. With n splits, there are n+1 buckets. @@ -50,7 +50,7 @@ final class Bucketizer private[ml] (override val parent: Estimator[Bucketizer]) "should be strictly increasing. Values at -inf, inf must be explicitly provided to cover" + " all Double values; otherwise, values outside the splits specified will be treated as" + " errors.", - Bucketizer.checkSplits) + Bucketizer.checkSplits _) /** @group getParam */ def getSplits: Array[Double] = $(splits) From 5db5325e726de2200d694f1fa839799663f31e2a Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 12 May 2015 17:52:19 -0700 Subject: [PATCH 12/17] update OneVsRest --- .../spark/ml/classification/OneVsRest.scala | 18 +++++++++++------- .../apache/spark/ml/feature/Bucketizer.scala | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index afb8d75d5738..efe7d05c915d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -25,7 +25,7 @@ import org.apache.spark.annotation.{AlphaComponent, Experimental} import org.apache.spark.ml._ import org.apache.spark.ml.attribute._ import org.apache.spark.ml.param.Param -import org.apache.spark.ml.util.MetadataUtils +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ @@ -40,7 +40,7 @@ private[ml] trait OneVsRestParams extends PredictorParams { type ClassifierType = Classifier[F, E, M] forSome { type F type M <: ClassificationModel[F, M] - type E <: Classifier[F, E, M] + type E <: Classifier[F, E, M] } /** @@ -71,9 +71,9 @@ private[ml] trait OneVsRestParams extends PredictorParams { */ @AlphaComponent class OneVsRestModel private[ml] ( - override val parent: OneVsRest, - labelMetadata: Metadata, - val models: Array[_ <: ClassificationModel[_,_]]) + override val uid: String, + labelMetadata: Metadata, + val models: Array[_ <: ClassificationModel[_,_]]) extends Model[OneVsRestModel] with OneVsRestParams { override def transformSchema(schema: StructType): StructType = { @@ -145,7 +145,10 @@ class OneVsRestModel private[ml] ( * is picked to label the example. 
*/ @Experimental -final class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams { +final class OneVsRest(override val uid: String) + extends Estimator[OneVsRestModel] with OneVsRestParams { + + def this() = this(Identifiable.randomUID("oneVsRest")) /** @group setParam */ def setClassifier(value: Classifier[_,_,_]): this.type = { @@ -204,6 +207,7 @@ final class OneVsRest extends Estimator[OneVsRestModel] with OneVsRestParams { NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses) case attr: Attribute => attr } - copyValues(new OneVsRestModel(this, labelAttribute.toMetadata(), models)) + val model = new OneVsRestModel(uid, labelAttribute.toMetadata(), models).setParent(this) + copyValues(model) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index 530a21f51570..d90b51a7bc75 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -54,7 +54,7 @@ final class Bucketizer(override val uid: String) "bucket, which also includes y. The splits should be strictly increasing. " + "Values at -inf, inf must be explicitly provided to cover all Double values; " + "otherwise, values outside the splits specified will be treated as errors.", - Bucketizer.checkSplits) + Bucketizer.checkSplits _) /** @group getParam */ def getSplits: Array[Double] = $(splits) From 83a163c6aa4da3d7e53434fbc4d654b385d845e7 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 12 May 2015 18:02:59 -0700 Subject: [PATCH 13/17] update JavaDeveloperApiExample --- .../examples/ml/JavaDeveloperApiExample.java | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 9a822608be4d..ec533d174ebd 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -104,22 +104,22 @@ public static void main(String[] args) throws Exception { * However, this should still compile and run successfully. */ class MyJavaLogisticRegression - extends Classifier { + extends Classifier { public MyJavaLogisticRegression() { - _init(); + init(); } public MyJavaLogisticRegression(String uid) { - this._uid = uid; - _init(); + this.uid_ = uid; + init(); } - private String _uid = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); @Override public String uid() { - return _uid; + return uid_; } /** @@ -134,7 +134,7 @@ public String uid() { int getMaxIter() { return (Integer) getOrDefault(maxIter); } - private void _init() { + private void init() { setMaxIter(100); } @@ -166,25 +166,21 @@ public MyJavaLogisticRegressionModel train(DataFrame dataset) { * However, this should still compile and run successfully. 
*/ class MyJavaLogisticRegressionModel - extends ClassificationModel { - - private MyJavaLogisticRegression parent_; - public MyJavaLogisticRegression parent() { return parent_; } + extends ClassificationModel { private Vector weights_; public Vector weights() { return weights_; } - public MyJavaLogisticRegressionModel(String uid, Vector weights_) { - this._uid = uid; - this.parent_ = parent_; - this.weights_ = weights_; + public MyJavaLogisticRegressionModel(String uid, Vector weights) { + this.uid_ = uid; + this.weights_ = weights; } - private String _uid = Identifiable$.MODULE$.randomUID("myJavaLogReg"); + private String uid_ = Identifiable$.MODULE$.randomUID("myJavaLogReg"); @Override public String uid() { - return _uid; + return uid_; } // This uses the default implementation of transform(), which reads column "features" and outputs From 409ea08bb2a381615723354177793994f19c5250 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 12 May 2015 18:16:33 -0700 Subject: [PATCH 14/17] minor updates --- .../main/scala/org/apache/spark/ml/Model.scala | 3 +++ .../scala/org/apache/spark/ml/param/params.scala | 2 +- .../apache/spark/ml/param/JavaTestParams.java | 16 ++++++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/Model.scala b/mllib/src/main/scala/org/apache/spark/ml/Model.scala index f310db3128b2..7fd515369b19 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Model.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Model.scala @@ -34,6 +34,9 @@ abstract class Model[M <: Model[M]] extends Transformer { */ var parent: Estimator[M] = _ + /** + * Sets the parent of this model (Java API). + */ def setParent(parent: Estimator[M]): M = { this.parent = parent this.asInstanceOf[M] diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index b971e7d05f1b..d5c3839ae813 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -495,7 +495,7 @@ trait Params extends Identifiable with Serializable { /** Validates that the input param belongs to this instance. 
*/ private def shouldOwn(param: Param[_]): Unit = { - require(param.parent == uid, s"Param $param does not belong to $this.") + require(param.parent == uid && hasParam(param.name), s"Param $param does not belong to $this.") } /** diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java index fbaaefeb7680..9039a7a7df67 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -29,20 +29,20 @@ public class JavaTestParams extends JavaParams { public JavaTestParams() { - this._uid = Identifiable$.MODULE$.randomUID("javaTestParams"); - _init(); + this.uid_ = Identifiable$.MODULE$.randomUID("javaTestParams"); + init(); } public JavaTestParams(String uid) { - this._uid = uid; - _init(); + this.uid_ = uid; + init(); } - private String _uid; + private String uid_; @Override public String uid() { - return _uid; + return uid_; } public IntParam myIntParam; @@ -63,13 +63,13 @@ public JavaTestParams setMyDoubleParam(double value) { public Param myStringParam; - public String getMyStringParam() { return (String)getOrDefault(myStringParam); } + public String getMyStringParam() { return getOrDefault(myStringParam); } public JavaTestParams setMyStringParam(String value) { set(myStringParam, value); return this; } - private void _init() { + private void init() { myIntParam = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); myDoubleParam = new DoubleParam(this, "myDoubleParam", "this is a double param", ParamValidators.inRange(0.0, 1.0)); From 873cacaf12f339d7d0931424130bc961c2c8d0cf Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 12 May 2015 21:43:17 -0700 Subject: [PATCH 15/17] fix tests in OneVsRest; fix a racing condition in shouldOwn --- .../spark/ml/classification/OneVsRest.scala | 9 +++------ .../org/apache/spark/ml/param/params.scala | 20 ++++++++++++------- .../ml/classification/OneVsRestSuite.scala | 6 ++++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index efe7d05c915d..1543f051ccd1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -47,12 +47,10 @@ private[ml] trait OneVsRestParams extends PredictorParams { * param for the base binary classifier that we reduce multiclass classification into. * @group param */ - val classifier: Param[ClassifierType] = - new Param(this, "classifier", "base binary classifier ") + val classifier: Param[ClassifierType] = new Param(this, "classifier", "base binary classifier") /** @group getParam */ def getClassifier: ClassifierType = $(classifier) - } /** @@ -70,7 +68,7 @@ private[ml] trait OneVsRestParams extends PredictorParams { * (taking label 0). */ @AlphaComponent -class OneVsRestModel private[ml] ( +final class OneVsRestModel private[ml] ( override val uid: String, labelMetadata: Metadata, val models: Array[_ <: ClassificationModel[_,_]]) @@ -151,8 +149,7 @@ final class OneVsRest(override val uid: String) def this() = this(Identifiable.randomUID("oneVsRest")) /** @group setParam */ - def setClassifier(value: Classifier[_,_,_]): this.type = { - // TODO: Find a better way to do this. 
Existential Types don't work with Java API so cast needed + def setClassifier(value: Classifier[_, _, _]): this.type = { set(classifier, value.asInstanceOf[ClassifierType]) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index d5c3839ae813..0393b8f78549 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -27,6 +27,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.util.Identifiable +import scala.util.Try + /** * :: AlphaComponent :: * A param with self-contained documentation and optionally default value. Primitive-typed param @@ -274,6 +276,9 @@ trait Params extends Identifiable with Serializable { /** * Returns all params sorted by their names. The default implementation uses Java reflection to * list all public methods that have no arguments and return [[Param]]. + * + * Note: Developer should not use this method in constructor because we cannot guarantee that + * this variable gets initialized before other params. */ lazy val params: Array[Param[_]] = { val methods = this.getClass.getMethods @@ -308,7 +313,7 @@ trait Params extends Identifiable with Serializable { * those are checked during schema validation. */ def validateParams(): Unit = { - params.filter(isDefined _).foreach { param => + params.filter(isDefined).foreach { param => param.asInstanceOf[Param[Any]].validate($(param)) } } @@ -352,15 +357,16 @@ trait Params extends Identifiable with Serializable { } /** Tests whether this instance contains a param with a given name. */ - def hasParam(paramName: String): Boolean = { - params.exists(_.name == paramName) - } + def hasParam(paramName: String): Boolean = Try(getParam(paramName)).isSuccess /** Gets a param by its name. 
*/ def getParam(paramName: String): Param[Any] = { - params.find(_.name == paramName).getOrElse { - throw new NoSuchElementException(s"Param $paramName does not exist.") - }.asInstanceOf[Param[Any]] + val m = this.getClass.getMethod(paramName) + if (Modifier.isPublic(m.getModifiers) && classOf[Param[_]].isAssignableFrom(m.getReturnType)) { + m.invoke(this).asInstanceOf[Param[Any]] + } else { + throw new NoSuchMethodException(s"Param $paramName does not exist.") + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index e65ffae918ca..990cfb08af83 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -57,7 +57,7 @@ class OneVsRestSuite extends FunSuite with MLlibTestSparkContext { test("one-vs-rest: default params") { val numClasses = 3 val ova = new OneVsRest() - ova.setClassifier(new LogisticRegression) + .setClassifier(new LogisticRegression) assert(ova.getLabelCol === "label") assert(ova.getPredictionCol === "prediction") val ovaModel = ova.fit(dataset) @@ -97,7 +97,9 @@ class OneVsRestSuite extends FunSuite with MLlibTestSparkContext { } } -private class MockLogisticRegression extends LogisticRegression { +private class MockLogisticRegression(uid: String) extends LogisticRegression(uid) { + + def this() = this("mockLogReg") setMaxIter(1) From 2569168e0a7bda1cd78c7f5956430480032b341c Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 13 May 2015 00:01:38 -0700 Subject: [PATCH 16/17] fix tests --- .../org/apache/spark/ml/param/params.scala | 14 ++++----- .../apache/spark/ml/param/JavaTestParams.java | 31 ++++++++++--------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 0393b8f78549..4703f4e68637 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -357,16 +357,15 @@ trait Params extends Identifiable with Serializable { } /** Tests whether this instance contains a param with a given name. */ - def hasParam(paramName: String): Boolean = Try(getParam(paramName)).isSuccess + def hasParam(paramName: String): Boolean = { + params.exists(_.name == paramName) + } /** Gets a param by its name. 
*/ def getParam(paramName: String): Param[Any] = { - val m = this.getClass.getMethod(paramName) - if (Modifier.isPublic(m.getModifiers) && classOf[Param[_]].isAssignableFrom(m.getReturnType)) { - m.invoke(this).asInstanceOf[Param[Any]] - } else { - throw new NoSuchMethodException(s"Param $paramName does not exist.") - } + params.find(_.name == paramName).getOrElse { + throw new NoSuchElementException(s"Param $paramName does not exist.") + }.asInstanceOf[Param[Any]] } /** @@ -428,7 +427,6 @@ trait Params extends Identifiable with Serializable { * @param value the default value */ protected final def setDefault[T](param: Param[T], value: T): this.type = { - shouldOwn(param) defaultParamMap.put(param, value) this } diff --git a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java index 9039a7a7df67..3a41890b92d6 100644 --- a/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java +++ b/mllib/src/test/java/org/apache/spark/ml/param/JavaTestParams.java @@ -45,38 +45,41 @@ public String uid() { return uid_; } - public IntParam myIntParam; + private IntParam myIntParam_; + public IntParam myIntParam() { return myIntParam_; } - public int getMyIntParam() { return (Integer)getOrDefault(myIntParam); } + public int getMyIntParam() { return (Integer)getOrDefault(myIntParam_); } public JavaTestParams setMyIntParam(int value) { - set(myIntParam, value); return this; + set(myIntParam_, value); return this; } - public DoubleParam myDoubleParam; + private DoubleParam myDoubleParam_; + public DoubleParam myDoubleParam() { return myDoubleParam_; } - public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam); } + public double getMyDoubleParam() { return (Double)getOrDefault(myDoubleParam_); } public JavaTestParams setMyDoubleParam(double value) { - set(myDoubleParam, value); return this; + set(myDoubleParam_, value); return this; } - public Param myStringParam; + private Param myStringParam_; + public Param myStringParam() { return myStringParam_; } - public String getMyStringParam() { return getOrDefault(myStringParam); } + public String getMyStringParam() { return getOrDefault(myStringParam_); } public JavaTestParams setMyStringParam(String value) { - set(myStringParam, value); return this; + set(myStringParam_, value); return this; } private void init() { - myIntParam = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); - myDoubleParam = new DoubleParam(this, "myDoubleParam", "this is a double param", + myIntParam_ = new IntParam(this, "myIntParam", "this is an int param", ParamValidators.gt(0)); + myDoubleParam_ = new DoubleParam(this, "myDoubleParam", "this is a double param", ParamValidators.inRange(0.0, 1.0)); List validStrings = Lists.newArrayList("a", "b"); - myStringParam = new Param(this, "myStringParam", "this is a string param", + myStringParam_ = new Param(this, "myStringParam", "this is a string param", ParamValidators.inArray(validStrings)); - setDefault(myIntParam, 1); - setDefault(myDoubleParam, 0.5); + setDefault(myIntParam_, 1); + setDefault(myDoubleParam_, 0.5); } } From 520f0a2a4a5a7de069546ebffede69e4f613ed2f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 13 May 2015 22:56:36 -0700 Subject: [PATCH 17/17] address comments --- .../org/apache/spark/examples/ml/DeveloperApiExample.scala | 2 +- mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala | 2 +- .../main/scala/org/apache/spark/ml/feature/Bucketizer.scala | 2 +- 
.../main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 2 +- mllib/src/main/scala/org/apache/spark/ml/param/params.scala | 2 -- .../main/scala/org/apache/spark/ml/recommendation/ALS.scala | 3 +-- 6 files changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index f8f9dd915b9d..3ee456edbe01 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -17,10 +17,10 @@ package org.apache.spark.examples.ml -import org.apache.spark.ml.util.Identifiable import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.classification.{ClassificationModel, Classifier, ClassifierParams} import org.apache.spark.ml.param.{IntParam, ParamMap} +import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.{DataFrame, Row, SQLContext} diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index f3368ad2d364..fac54188f9f4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -174,7 +174,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] { */ @AlphaComponent class PipelineModel private[ml] ( - val uid: String, + override val uid: String, val stages: Array[Transformer]) extends Model[PipelineModel] with Logging { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index d90b51a7bc75..530a21f51570 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -54,7 +54,7 @@ final class Bucketizer(override val uid: String) "bucket, which also includes y. The splits should be strictly increasing. 
" + "Values at -inf, inf must be explicitly provided to cover all Double values; " + "otherwise, values outside the splits specified will be treated as errors.", - Bucketizer.checkSplits _) + Bucketizer.checkSplits) /** @group getParam */ def getSplits: Array[Double] = $(splits) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 63055050b892..3f79b67309f0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -19,11 +19,11 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{NumericType, StringType, StructType} diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 4703f4e68637..bb3998c5629d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -27,8 +27,6 @@ import scala.collection.JavaConverters._ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml.util.Identifiable -import scala.util.Try - /** * :: AlphaComponent :: * A param with self-contained documentation and optionally default value. Primitive-typed param diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index b70dedbd590a..45c57b50da70 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -306,8 +306,7 @@ class ALS(override val uid: String) extends Estimator[ALSModel] with ALSParams { maxIter = $(maxIter), regParam = $(regParam), implicitPrefs = $(implicitPrefs), alpha = $(alpha), nonnegative = $(nonnegative), checkpointInterval = $(checkpointInterval), seed = $(seed)) - val model = new ALSModel(uid, $(rank), userFactors, itemFactors) - .setParent(this) + val model = new ALSModel(uid, $(rank), userFactors, itemFactors).setParent(this) copyValues(model) }