update

VinceShieh · VinceShieh · commit 9bcaffc19e7a · 2017-03-01T10:09:36.000+08:00
Signed-off-by: VinceShieh &lt;vincent.xie@intel.com&gt;
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -502,7 +502,8 @@ for more details on the API.
 ## StringIndexer
 
 `StringIndexer` encodes a string column of labels to a column of label indices.
-The indices are in `[0, numLabels]`, ordered by label frequencies, so the most frequent label gets index `0`.
+The indices are in `[0, numLabels)`, ordered by label frequencies, so the most frequent label gets index `0`.
+The unseen labels will be put at index `numLabels` if the user chooses to keep them.
 If the input column is numeric, we cast it to string and index the string
 values. When downstream pipeline components such as `Estimator` or
 `Transformer` make use of this string-indexed label, you must set the input
@@ -580,7 +581,7 @@ will be generated:
 
 Notice that the rows containing "d" or "e" do not appear.
 
-If you had called `setHandleInvalid("keep")`, the following dataset
+If you call `setHandleInvalid("keep")`, the following dataset
 will be generated:
 
 ~~~~
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.ml.feature
 
-import scala.language.existentials
-
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
@@ -37,24 +35,26 @@ import org.apache.spark.util.collection.OpenHashMap
  * Base trait for [[StringIndexer]] and [[StringIndexerModel]].
  */
 private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol {
-  val SKIP_UNSEEN_LABEL: String = "skip"
-  val ERROR_UNSEEN_LABEL: String = "error"
-  val KEEP_UNSEEN_LABEL: String = "keep"
-  val supportedHandleInvalids: Array[String] =
-    Array(SKIP_UNSEEN_LABEL, ERROR_UNSEEN_LABEL, KEEP_UNSEEN_LABEL)
 
   /**
    * Param for how to handle unseen labels. Options are 'skip' (filter out rows with
-   * unseen labels), 'error' (throw an error), or 'keep' (map unseen labels with
-   * indices [numLabels]).
+   * unseen labels), 'error' (throw an error), or 'keep' (put unseen labels in a special additional
+   * bucket, at index numLabels).
    * Default: "error"
    * @group param
    */
   @Since("2.1.0")
   val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
     "unseen labels. Options are 'skip' (filter out rows with unseen labels), " +
-    "error (throw an error), or 'keep' (map unseen labels with indices [numLabels]).",
-    ParamValidators.inArray(supportedHandleInvalids))
+    "error (throw an error), or 'keep' (put unseen labels in a special additional bucket, " +
+    "at index numLabels).",
+    ParamValidators.inArray(StringIndexer.supportedHandleInvalids))
+
+  setDefault(handleInvalid, StringIndexer.ERROR_UNSEEN_LABEL)
+
+  /** @group getParam */
+  @Since("2.1.0")
+  def getHandleInvalid: String = $(handleInvalid)
 
   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
@@ -97,14 +97,9 @@ class StringIndexer @Since("1.4.0") (
   @Since("1.4.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  /** @group getParam */
-  @Since("2.1.0")
-  def getHandleInvalid: String = $(handleInvalid)
-
   /** @group setParam */
-  @Since("2.1.0")
+  @Since("2.2.0")
   def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
-  setDefault(handleInvalid, ERROR_UNSEEN_LABEL)
 
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): StringIndexerModel = {
@@ -128,7 +123,11 @@ class StringIndexer @Since("1.4.0") (
 
 @Since("1.6.0")
 object StringIndexer extends DefaultParamsReadable[StringIndexer] {
-
+  private[feature] val SKIP_UNSEEN_LABEL: String = "skip"
+  private[feature] val ERROR_UNSEEN_LABEL: String = "error"
+  private[feature] val KEEP_UNSEEN_LABEL: String = "keep"
+  private[feature] val supportedHandleInvalids: Array[String] =
+    Array(SKIP_UNSEEN_LABEL, ERROR_UNSEEN_LABEL, KEEP_UNSEEN_LABEL)
   @Since("1.6.0")
   override def load(path: String): StringIndexer = super.load(path)
 }
@@ -172,14 +171,10 @@ class StringIndexerModel (
   @Since("1.4.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  /** @group getParam */
-  @Since("2.1.0")
-  def getHandleInvalid: String = $(handleInvalid)
-
   /** @group setParam */
   @Since("2.1.0")
   def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
-  setDefault(handleInvalid, ERROR_UNSEEN_LABEL)
+  setDefault(handleInvalid, StringIndexer.ERROR_UNSEEN_LABEL)
 
   @Since("2.0.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
@@ -194,12 +189,12 @@ class StringIndexerModel (
       .withName($(outputCol)).withValues(labels).toMetadata()
     // If we are skipping invalid records, filter them out.
     val (filteredDataset, keepInvalid) = getHandleInvalid match {
-      case SKIP_UNSEEN_LABEL =>
+      case StringIndexer.SKIP_UNSEEN_LABEL =>
         val filterer = udf { label: String =>
           labelToIndex.contains(label)
         }
         (dataset.where(filterer(dataset($(inputCol)))), false)
-      case _ => (dataset, getHandleInvalid == KEEP_UNSEEN_LABEL)
+      case _ => (dataset, getHandleInvalid == StringIndexer.KEEP_UNSEEN_LABEL)
     }
 
     val indexer = udf { label: String =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -78,11 +78,11 @@ class StringIndexerSuite
 
     indexer.setHandleInvalid("skip")
     // Verify that we skip the c record
-    var transformed = indexer.transform(df2)
-    var attr = Attribute.fromStructField(transformed.schema("labelIndex"))
+    val transformedSkip = indexer.transform(df2)
+    val attrSkip = Attribute.fromStructField(transformedSkip.schema("labelIndex"))
       .asInstanceOf[NominalAttribute]
-    assert(attr.values.get === Array("b", "a"))
-    val outputSkip = transformed.select("id", "labelIndex").rdd.map { r =>
+    assert(attrSkip.values.get === Array("b", "a"))
+    val outputSkip = transformedSkip.select("id", "labelIndex").rdd.map { r =>
       (r.getInt(0), r.getDouble(1))
     }.collect().toSet
     // a -> 1, b -> 0
@@ -91,11 +91,11 @@ class StringIndexerSuite
 
     indexer.setHandleInvalid("keep")
     // Verify that we keep the unseen records
-    transformed = indexer.transform(df2)
-    attr = Attribute.fromStructField(transformed.schema("labelIndex"))
+    val transformedKeep = indexer.transform(df2)
+    val attrKeep = Attribute.fromStructField(transformedKeep.schema("labelIndex"))
       .asInstanceOf[NominalAttribute]
-    assert(attr.values.get === Array("b", "a"))
-    val outputKeep = transformed.select("id", "labelIndex").rdd.map { r =>
+    assert(attrKeep.values.get === Array("b", "a"))
+    val outputKeep = transformedKeep.select("id", "labelIndex").rdd.map { r =>
       (r.getInt(0), r.getDouble(1))
     }.collect().toSet
     // a -> 1, b -> 0, c -> 2, d -> 3