apache · mpjlu · Sep 23, 2016 · Sep 27, 2016 · Sep 27, 2016 · Sep 27, 2016
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -1423,12 +1423,14 @@ for more details on the API.
 `ChiSqSelector` stands for Chi-Squared feature selection. It operates on labeled data with
 categorical features. ChiSqSelector uses the
 [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which
-features to choose. It supports three selection methods: `numTopFeatures`, `percentile`, `fpr`:
+features to choose. It supports five selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`:
 
 * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.
 * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number.
 * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection.
+* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold.
+* `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
 
 By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.
 The user can choose a selection method using `setSelectorType`.
 

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
@@ -227,11 +227,13 @@ both speed and statistical learning behavior.
 [`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) implements
 Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the
 [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which
-features to choose. It supports three selection methods: `numTopFeatures`, `percentile`, `fpr`:
+features to choose. It supports five selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`:
 
 * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.
 * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number.
 * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection.
+* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold.
+* `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
 
 By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.
 The user can choose a selection method using `setSelectorType`.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -91,9 +91,39 @@ private[feature] trait ChiSqSelectorParams extends Params
   @Since("2.1.0")
   def getFpr: Double = $(fpr)
 
+  /**
+   * The upper bound of the expected false discovery rate.
+   * Only applicable when selectorType = "fdr".
+   * Must be in the range [0, 1]. Default value is 0.05.
+   * @group param
+   */
+  @Since("2.2.0")
+  final val fdr = new DoubleParam(this, "fdr",
+    "The upper bound of the expected false discovery rate.", ParamValidators.inRange(0, 1))
+  setDefault(fdr -> 0.05)
+
+  /** @group getParam */
+  @Since("2.2.0")
+  def getFdr: Double = $(fdr)
+
+  /**
+   * The upper bound of the expected family-wise error rate.
+   * Only applicable when selectorType = "fwe".
+   * Must be in the range [0, 1]. Default value is 0.05.
+   * @group param
+   */
+  @Since("2.2.0")
+  final val fwe = new DoubleParam(this, "fwe",
+    "The upper bound of the expected family-wise error rate.", ParamValidators.inRange(0, 1))
+  setDefault(fwe -> 0.05)
+
+  /** @group getParam */
+  @Since("2.2.0")
+  def getFwe: Double = $(fwe)
+
   /**
    * The selector type of the ChisqSelector.
-   * Supported options: "numTopFeatures" (default), "percentile", "fpr".
+   * Supported options: "numTopFeatures" (default), "percentile", "fpr", "fdr", "fwe".
    * @group param
    */
   @Since("2.1.0")
@@ -111,11 +139,17 @@ private[feature] trait ChiSqSelectorParams extends Params
 /**
  * Chi-Squared feature selection, which selects categorical features to use for predicting a
  * categorical label.
- * The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`.
+ * The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,
+ * `fdr`, `fwe`.
  *  - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
  *  - `percentile` is similar but chooses a fraction of all features instead of a fixed number.
  *  - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
  *    positive rate of selection.
+ *  - `fdr` uses the [Benjamini-Hochberg procedure]
+ *    (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure)
+ *    to choose all features whose false discovery rate is below a threshold.
+ *  - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by
+ *    1/numFeatures, thus controlling the family-wise error rate of selection.
  * By default, the selection method is `numTopFeatures`, with the default number of top features
  * set to 50.
  */
@@ -138,6 +172,14 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
   @Since("2.1.0")
   def setFpr(value: Double): this.type = set(fpr, value)
 
+  /** @group setParam */
+  @Since("2.2.0")
+  def setFdr(value: Double): this.type = set(fdr, value)
+
+  /** @group setParam */
+  @Since("2.2.0")
+  def setFwe(value: Double): this.type = set(fwe, value)
+
   /** @group setParam */
   @Since("2.1.0")
   def setSelectorType(value: String): this.type = set(selectorType, value)
@@ -167,6 +209,8 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
       .setNumTopFeatures($(numTopFeatures))
       .setPercentile($(percentile))
       .setFpr($(fpr))
+      .setFdr($(fdr))
+      .setFwe($(fwe))
     val model = selector.fit(input)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -639,12 +639,16 @@ private[python] class PythonMLLibAPI extends Serializable {
       numTopFeatures: Int,
       percentile: Double,
       fpr: Double,
+      fdr: Double,
+      fwe: Double,
       data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
     new ChiSqSelector()
       .setSelectorType(selectorType)
       .setNumTopFeatures(numTopFeatures)
       .setPercentile(percentile)
       .setFpr(fpr)
+      .setFdr(fdr)
+      .setFwe(fwe)
       .fit(data.rdd)
   }
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -171,11 +171,17 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 
 /**
  * Creates a ChiSquared feature selector.
- * The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`.
+ * The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`,
+ * `fdr`, `fwe`.
  *  - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test.
  *  - `percentile` is similar but chooses a fraction of all features instead of a fixed number.
  *  - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false
  *    positive rate of selection.
+ *  - `fdr` uses the [Benjamini-Hochberg procedure]
+ *    (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure)
+ *    to choose all features whose false discovery rate is below a threshold.
+ *  - `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by
+ *    1/numFeatures, thus controlling the family-wise error rate of selection.
  * By default, the selection method is `numTopFeatures`, with the default number of top features
  * set to 50.
  */
@@ -184,6 +190,8 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   var numTopFeatures: Int = 50
   var percentile: Double = 0.1
   var fpr: Double = 0.05
+  var fdr: Double = 0.05
+  var fwe: Double = 0.05
   var selectorType = ChiSqSelector.NumTopFeatures
 
   /**
@@ -215,6 +223,20 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
     this
   }
 
+  @Since("2.2.0")
+  def setFdr(value: Double): this.type = {
+    require(0.0 <= value && value <= 1.0, "FDR must be in [0,1]")
+    fdr = value
+    this
+  }
+
+  @Since("2.2.0")
+  def setFwe(value: Double): this.type = {
+    require(0.0 <= value && value <= 1.0, "FWE must be in [0,1]")
+    fwe = value
+    this
+  }
+
   @Since("2.1.0")
   def setSelectorType(value: String): this.type = {
     require(ChiSqSelector.supportedSelectorTypes.contains(value),
@@ -245,6 +267,21 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
       case ChiSqSelector.FPR =>
         chiSqTestResult
           .filter { case (res, _) => res.pValue < fpr }
+      case ChiSqSelector.FDR =>
+        // This uses the Benjamini-Hochberg procedure.
+        // https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure
+        val tempRes = chiSqTestResult
+          .sortBy { case (res, _) => res.pValue }
+        // `lastIndexWhere` yields -1 when no p-value passes, so `take(0)` selects no features
+        // (calling `.max` on an empty collection would throw).
+        val maxIndex = tempRes
+          .zipWithIndex
+          .lastIndexWhere { case ((res, _), index) =>
+            res.pValue <= fdr * (index + 1) / chiSqTestResult.length }
+        tempRes.take(maxIndex + 1)
+      case ChiSqSelector.FWE =>
+        chiSqTestResult
+          .filter { case (res, _) => res.pValue < fwe / chiSqTestResult.length }
       case errorType =>
         throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
     }
@@ -255,19 +292,22 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
 private[spark] object ChiSqSelector {
 
-  /**
-   * String name for `numTopFeatures` selector type.
-   */
-  val NumTopFeatures: String = "numTopFeatures"
+  /** String name for `numTopFeatures` selector type. */
+  private[spark] val NumTopFeatures: String = "numTopFeatures"
 
-  /**
-   * String name for `percentile` selector type.
-   */
-  val Percentile: String = "percentile"
+  /** String name for `percentile` selector type. */
+  private[spark] val Percentile: String = "percentile"
 
   /** String name for `fpr` selector type. */
-  val FPR: String = "fpr"
+  private[spark] val FPR: String = "fpr"
+
+  /** String name for `fdr` selector type. */
+  private[spark] val FDR: String = "fdr"
+
+  /** String name for `fwe` selector type. */
+  private[spark] val FWE: String = "fwe"
+
 
   /** Set of selector types that ChiSqSelector supports. */
-  val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR)
+  val supportedSelectorTypes: Array[String] = Array(NumTopFeatures, Percentile, FPR, FDR, FWE)
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -79,6 +79,12 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
     ChiSqSelectorSuite.testSelector(selector, dataset)
   }
 
+  test("Test Chi-Square selector: fwe") {
+    val selector = new ChiSqSelector()
+      .setOutputCol("filtered").setSelectorType("fwe").setFwe(0.6)
+    ChiSqSelectorSuite.testSelector(selector, dataset)
+  }
+
   test("read/write") {
     def checkModelData(model: ChiSqSelectorModel, model2: ChiSqSelectorModel): Unit = {
       assert(model.selectedFeatures === model2.selectedFeatures)