Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2adebe8
add a chiSquare Selector based on False Positive Rate (FPR) test
Aug 10, 2016
04053ca
Merge remote-tracking branch 'origin/master' into fprChiSquare
Aug 11, 2016
7623563
Configure the ChiSqSelector to reuse ChiSqTestResult by numTopFeature…
Aug 16, 2016
3d6aecb
Config the ChiSqSelector to reuse the ChiSqTestResult by KBest, Perce…
Aug 17, 2016
026ac85
Merge branch 'master' into fprChiSquare2
Aug 17, 2016
5305709
add Since annotation
Aug 17, 2016
1e8d83a
Not reuse the ChiSqTestResult to be consistent with other methods
Aug 22, 2016
85a17dd
fix Percentile bugs, optimize the code
Aug 22, 2016
61b71c8
change the default value of Percentile
Aug 22, 2016
d7b2892
Add require for setAlpha value
Aug 23, 2016
6699396
rm isSorted function, change gtEq(0) to inRange(0,1) for percentile a…
Aug 23, 2016
b8986b5
Optimize fit function of ml ChiSqSelector
Aug 23, 2016
5c2e44c
Fpr to FPR, sort all cases in fit
Aug 24, 2016
0d3967a
Add Python API for ChiSqSelector
Aug 29, 2016
1dc6a8e
split the ChiSqSelector param to numTopFeateres, Percentile, Alpha in…
Sep 5, 2016
9908871
Add type check for Python ChiSqSelector
Sep 5, 2016
bbccac7
Change the exception type of value check
Sep 6, 2016
c35bcf1
change python code style
Sep 13, 2016
e8f03ed
change python code style
Sep 13, 2016
ec74dde
revert isSort to pass MiMa test
Sep 13, 2016
6398f4c
Change MimaExcludes
Sep 14, 2016
6cc4c92
Change Mima conflict
Sep 14, 2016
1d2f67f
add javadoc
Sep 18, 2016
6220dd5
fix mima conflict
Sep 18, 2016
ce3f8fb
Merge remote-tracking branch 'origin/master' into fprChiSquare
Sep 19, 2016
88d2143
change javadoc
Sep 19, 2016
24f26f2
fix mima conflict
Sep 20, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.feature.ChiSqSelectorType
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.rdd.RDD
Expand Down Expand Up @@ -54,11 +55,47 @@ private[feature] trait ChiSqSelectorParams extends Params

/** @group getParam */
def getNumTopFeatures: Int = $(numTopFeatures)

/**
* Fraction of features that the selector will keep, taken in descending order of the
* chi-squared statistic. It is read by `fit` only when `selectorType` is `Percentile`.
* Values are validated with `ParamValidators.inRange(0, 1)`; the default is 0.1.
*
* @group param
*/
final val percentile = new DoubleParam(this, "percentile",
"Percentile of features that selector will select, ordered by statistics value descending.",
ParamValidators.inRange(0, 1))
setDefault(percentile -> 0.1)

/** @group getParam */
def getPercentile: Double = $(percentile)

/**
* Threshold for the `FPR` selection method: the highest p-value for features to be kept.
* It is read by `fit` only when `selectorType` is `FPR`.
* Values are validated with `ParamValidators.inRange(0, 1)`; the default is 0.05.
*
* @group param
*/
final val alpha = new DoubleParam(this, "alpha",
"The highest p-value for features to be kept.",
ParamValidators.inRange(0, 1))
setDefault(alpha -> 0.05)

/** @group getParam */
def getAlpha: Double = $(alpha)

/**
* The selection method used by `fit`: one of `KBest`, `Percentile` or `FPR`, stored as the
* string name of the matching `ChiSqSelectorType` value defined in MLlib.
* The default is `KBest`.
*
* The type is tied to the numeric parameters through the setters:
*  - calling setNumTopFeatures sets the type to `KBest`, and `fit` then reads `numTopFeatures`;
*  - calling setPercentile sets the type to `Percentile`, and `fit` then reads `percentile`;
*  - calling setAlpha sets the type to `FPR`, and `fit` then reads `alpha`.
* Only the parameter that belongs to the current type is used, so the last setter called wins.
*
* @group param
*/
final val selectorType = new Param[String](this, "selectorType",
"ChiSqSelector Type: KBest, Percentile, FPR")
setDefault(selectorType -> ChiSqSelectorType.KBest.toString)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somewhere, there should be a few brief sentences describing how the types relate to the parameters to this class.


/** @group getParam */
def getChiSqSelectorType: String = $(selectorType)
}

/**
* Chi-Squared feature selection, which selects categorical features to use for predicting a
* categorical label.
* The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
* `KBest` chooses the `k` top features according to a chi-squared test.
* `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
* `FPR` chooses all features whose false positive rate meets some threshold.
* By default, the selection method is `KBest` and the number of top features is 50.
* Use setNumTopFeatures, setPercentile or setAlpha to choose a different selection method.
*/
@Since("1.6.0")
final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: String)
Expand All @@ -69,7 +106,22 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str

/** @group setParam */
@Since("1.6.0")
def setNumTopFeatures(value: Int): this.type = set(numTopFeatures, value)
def setNumTopFeatures(value: Int): this.type = {
set(selectorType, ChiSqSelectorType.KBest.toString)
set(numTopFeatures, value)
}

/**
* Switches the selection method to `Percentile` and stores the fraction of features to keep.
*
* @group setParam
*/
@Since("2.1.0")
def setPercentile(value: Double): this.type =
  set(selectorType, ChiSqSelectorType.Percentile.toString).set(percentile, value)

/**
* Switches the selection method to `FPR` and stores the p-value threshold.
*
* @group setParam
*/
@Since("2.1.0")
def setAlpha(value: Double): this.type =
  set(selectorType, ChiSqSelectorType.FPR.toString).set(alpha, value)

/** @group setParam */
@Since("1.6.0")
Expand All @@ -91,8 +143,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
case Row(label: Double, features: Vector) =>
OldLabeledPoint(label, OldVectors.fromML(features))
}
val chiSqSelector = new feature.ChiSqSelector($(numTopFeatures)).fit(input)
copyValues(new ChiSqSelectorModel(uid, chiSqSelector).setParent(this))
var selector = new feature.ChiSqSelector()
ChiSqSelectorType.withName($(selectorType)) match {
case ChiSqSelectorType.KBest =>
selector.setNumTopFeatures($(numTopFeatures))
case ChiSqSelectorType.Percentile =>
selector.setPercentile($(percentile))
case ChiSqSelectorType.FPR =>
selector.setAlpha($(alpha))
case errorType =>
throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
}
val model = selector.fit(input)
copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
}

@Since("1.6.0")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -629,13 +629,35 @@ private[python] class PythonMLLibAPI extends Serializable {
}

/**
* Java stub for ChiSqSelector.fit(). This stub returns a
* Java stub for ChiSqSelector.fit() when the selection type is KBest. This stub returns a
* handle to the Java object instead of the content of the Java object.
* Extra care needs to be taken in the Python code to ensure it gets freed on
* exit; see the Py4J documentation.
*/
def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
new ChiSqSelector(numTopFeatures).fit(data.rdd)
def fitChiSqSelectorKBest(numTopFeatures: Int,
    data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
  // Configure the selector for KBest first, then fit it on the underlying Scala RDD.
  val kBestSelector = new ChiSqSelector().setNumTopFeatures(numTopFeatures)
  kBestSelector.fit(data.rdd)
}

/**
* Java stub for ChiSqSelector.fit() using the Percentile selection type.
* The caller receives a handle to the fitted Java model object rather than its content,
* so the Python code must take extra care that the handle is freed on exit;
* see the Py4J documentation.
*/
def fitChiSqSelectorPercentile(percentile: Double,
    data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
  // Configure the selector for Percentile first, then fit it on the underlying Scala RDD.
  val percentileSelector = new ChiSqSelector().setPercentile(percentile)
  percentileSelector.fit(data.rdd)
}

/**
* Java stub for ChiSqSelector.fit() using the FPR selection type.
* The caller receives a handle to the fitted Java model object rather than its content,
* so the Python code must take extra care that the handle is freed on exit;
* see the Py4J documentation.
*/
def fitChiSqSelectorFPR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
  // Configure the selector for FPR first, then fit it on the underlying Scala RDD.
  val fprSelector = new ChiSqSelector().setAlpha(alpha)
  fprSelector.fit(data.rdd)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,21 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Row, SparkSession}

/**
* Selection methods supported by the chi-squared feature selector.
*  - `KBest`: keep a fixed number of top features, ranked by the chi-squared statistic.
*  - `Percentile`: keep a fraction of all features, ranked the same way.
*  - `FPR`: keep every feature whose p-value is below a threshold.
*/
@Since("2.1.0")
private[spark] object ChiSqSelectorType extends Enumeration {
// Alias so that callers can refer to the enumeration's value type by name.
type SelectorType = Value
val KBest, Percentile, FPR = Value
}

/**
* Chi Squared selector model.
*
* @param selectedFeatures list of indices to select (filter). Must be ordered asc
* @param selectedFeatures list of indices to select (filter).
*/
@Since("1.3.0")
class ChiSqSelectorModel @Since("1.3.0") (
@Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable {

require(isSorted(selectedFeatures), "Array has to be sorted asc")

protected def isSorted(array: Array[Int]): Boolean = {
var i = 1
val len = array.length
while (i < len) {
if (array(i) < array(i-1)) return false
i += 1
}
true
}

/**
* Applies transformation on a vector.
*
Expand All @@ -69,21 +63,22 @@ class ChiSqSelectorModel @Since("1.3.0") (
* Preserves the order of filtered features the same as their indices are stored.
* Might be moved to Vector as .slice
* @param features vector
* @param filterIndices indices of features to filter, must be ordered asc
* @param filterIndices indices of features to filter
*/
private def compress(features: Vector, filterIndices: Array[Int]): Vector = {
val orderedIndices = filterIndices.sorted
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be computed once and stored, rather than store unsorted indices and resort them.

features match {
case SparseVector(size, indices, values) =>
val newSize = filterIndices.length
val newSize = orderedIndices.length
val newValues = new ArrayBuilder.ofDouble
val newIndices = new ArrayBuilder.ofInt
var i = 0
var j = 0
var indicesIdx = 0
var filterIndicesIdx = 0
while (i < indices.length && j < filterIndices.length) {
while (i < indices.length && j < orderedIndices.length) {
indicesIdx = indices(i)
filterIndicesIdx = filterIndices(j)
filterIndicesIdx = orderedIndices(j)
if (indicesIdx == filterIndicesIdx) {
newIndices += j
newValues += values(i)
Expand All @@ -101,7 +96,7 @@ class ChiSqSelectorModel @Since("1.3.0") (
Vectors.sparse(newSize, newIndices.result(), newValues.result())
case DenseVector(values) =>
val values = features.toArray
Vectors.dense(filterIndices.map(i => values(i)))
Vectors.dense(orderedIndices.map(i => values(i)))
case other =>
throw new UnsupportedOperationException(
s"Only sparse and dense vectors are supported but got ${other.getClass}.")
Expand Down Expand Up @@ -171,14 +166,57 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {

/**
* Creates a ChiSquared feature selector.
* @param numTopFeatures number of features that selector will select
* (ordered by statistic value descending)
* Note that if the number of features is less than numTopFeatures,
* then this will select all features.
* The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
* `KBest` chooses the `k` top features according to a chi-squared test.
* `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
* `FPR` chooses all features whose false positive rate meets some threshold.
* By default, the selection method is `KBest` and the number of top features is 50.
* Use setNumTopFeatures, setPercentile or setAlpha to choose a different selection method.
*/
@Since("1.3.0")
class ChiSqSelector @Since("1.3.0") (
@Since("1.3.0") val numTopFeatures: Int) extends Serializable {
class ChiSqSelector @Since("2.1.0") () extends Serializable {
var numTopFeatures: Int = 50
var percentile: Double = 0.1
var alpha: Double = 0.05
var selectorType = ChiSqSelectorType.KBest

/**
* This is the same as calling this() followed by setNumTopFeatures(numTopFeatures).
*/
@Since("1.3.0")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing constructor should still have javadoc maybe pointing to the setNumTopFeatures method to say that's the effect it has

def this(numTopFeatures: Int) {
this()
this.numTopFeatures = numTopFeatures
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should call setNumTopFeatures to set the type too?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not necessary, because the default selectorType is KBest

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. It seemed to split the logic a bit here but it's not bad. The default behavior needs to be documented then. Now there is effectively a default numTopFeatures.

}

/**
* Selects the `KBest` method and records how many top features to keep.
*
* @param value number of features to select
* @return this selector, to allow call chaining
*/
@Since("1.6.0")
def setNumTopFeatures(value: Int): this.type = {
  selectorType = ChiSqSelectorType.KBest
  numTopFeatures = value
  this
}

/**
* Selects the `Percentile` method and records the fraction of features to keep.
*
* @param value fraction of features to select; must lie in [0, 1]
* @return this selector, to allow call chaining
* @throws IllegalArgumentException if `value` is outside [0, 1]
*/
@Since("2.1.0")
def setPercentile(value: Double): this.type = {
  val withinUnitInterval = value >= 0.0 && value <= 1.0
  require(withinUnitInterval, "Percentile must be in [0,1]")
  selectorType = ChiSqSelectorType.Percentile
  percentile = value
  this
}

@Since("2.1.0")
def setAlpha(value: Double): this.type = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it need a require for alpha here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

require is added, thanks

require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]")
alpha = value
selectorType = ChiSqSelectorType.FPR
this
}

/**
* Sets the selection method directly, leaving numTopFeatures, percentile and alpha untouched.
*
* @param value the selection method to use
* @return this selector, to allow call chaining
*/
@Since("2.1.0")
def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value; this }

/**
* Returns a ChiSquared feature selector.
Expand All @@ -189,11 +227,20 @@ class ChiSqSelector @Since("1.3.0") (
*/
@Since("1.3.0")
def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
val indices = Statistics.chiSqTest(data)
val chiSqTestResult = Statistics.chiSqTest(data)
.zipWithIndex.sortBy { case (res, _) => -res.statistic }
.take(numTopFeatures)
.map { case (_, indices) => indices }
.sorted
val features = selectorType match {
case ChiSqSelectorType.KBest => chiSqTestResult
.take(numTopFeatures)
case ChiSqSelectorType.Percentile => chiSqTestResult
.take((chiSqTestResult.length * percentile).toInt)
case ChiSqSelectorType.FPR => chiSqTestResult
.filter{ case (res, _) => res.pValue < alpha }
case errorType =>
throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
}
val indices = features.map { case (_, indices) => indices }
new ChiSqSelectorModel(indices)
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,23 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
.map(x => (x._1.label, x._1.features, x._2))
.toDF("label", "data", "preFilteredData")

val model = new ChiSqSelector()
val selector = new ChiSqSelector()
.setNumTopFeatures(1)
.setFeaturesCol("data")
.setLabelCol("label")
.setOutputCol("filtered")

model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
case Row(vec1: Vector, vec2: Vector) =>
assert(vec1 ~== vec2 absTol 1e-1)
}

selector.setPercentile(0.34).fit(df).transform(df)
.select("filtered", "preFilteredData").collect().foreach {
case Row(vec1: Vector, vec2: Vector) =>
assert(vec1 ~== vec2 absTol 1e-1)
}

}

test("ChiSqSelector read/write") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,24 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
assert(filteredData == preFilteredData)
}

// Checks the FPR selection method on a mix of sparse and dense input vectors:
// the selector is configured through setAlpha(0.1) instead of a fixed feature count.
test("ChiSqSelector by FPR transform test (sparse & dense vector)") {
// Four rows with four features each and three distinct labels, spread over 2 partitions.
val labeledDiscreteData = sc.parallelize(
Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
// Expected output: every row reduced to its value at feature index 3.
// The second and third rows are equal after filtering, so they collapse to one Set element.
val preFilteredData =
Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
LabeledPoint(1.0, Vectors.dense(Array(4.0))),
LabeledPoint(1.0, Vectors.dense(Array(4.0))),
LabeledPoint(2.0, Vectors.dense(Array(9.0))))
val model = new ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData)
// Apply the fitted model row by row, keeping the label; compare as sets so order is ignored.
val filteredData = labeledDiscreteData.map { lp =>
LabeledPoint(lp.label, model.transform(lp.features))
}.collect().toSet
assert(filteredData == preFilteredData)
}

test("model load / save") {
val model = ChiSqSelectorSuite.createModel()
val tempDir = Utils.createTempDir()
Expand Down
3 changes: 3 additions & 0 deletions project/MimaExcludes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,9 @@ object MimaExcludes {
) ++ Seq(
// [SPARK-17163] Unify logistic regression interface. Private constructor has new signature.
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.this")
) ++ Seq(
// [SPARK-17017] Add chiSquare selector based on False Positive Rate (FPR) test
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.feature.ChiSqSelectorModel.isSorted")
)
}

Expand Down
Loading