28 changes: 28 additions & 0 deletions mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -18,7 +18,10 @@
package org.apache.spark.ml.clustering

import org.apache.hadoop.fs.Path
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.{Estimator, Model}
@@ -457,6 +460,31 @@ sealed abstract class LDAModel private[ml] (
  def describeTopics(): DataFrame = describeTopics(10)
}

object LDAModel extends MLReadable[LDAModel] {

  private class LDAModelReader extends MLReader[LDAModel] {
    override def load(path: String): LDAModel = {
      val metadataPath = new Path(path, "metadata").toString
      val metadata = parse(sc.textFile(metadataPath, 1).first())
      implicit val format = DefaultFormats
      val className = (metadata \ "class").extract[String]
      className match {
        case c if className == classOf[LocalLDAModel].getName =>
          LocalLDAModel.load(path)
        case c if className == classOf[DistributedLDAModel].getName =>
          DistributedLDAModel.load(path)
        case _ => throw new SparkException(s"$className in $path is not a LDAModel")
      }
    }
  }

  @Since("2.0.0")
  override def read: MLReader[LDAModel] = new LDAModelReader

  @Since("2.0.0")
  override def load(path: String): LDAModel = super.load(path)
}


/**
 * :: Experimental ::
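With the new companion object, a caller can restore a saved LDA model without knowing which optimizer produced it: the reader parses the JSON metadata written alongside the model, reads its "class" field, and delegates to the matching concrete loader. A minimal usage sketch (illustrative caller code, not part of this diff; the path value is hypothetical and assumes an earlier model.save(path)):

// Sketch only: generic loading followed by a match on the concrete subclass.
import org.apache.spark.ml.clustering.{DistributedLDAModel, LDAModel, LocalLDAModel}

val path = "/tmp/lda-model" // hypothetical directory written by model.save(path)
val restored: LDAModel = LDAModel.load(path)
restored match {
  case _: DistributedLDAModel => println("loaded a DistributedLDAModel (EM optimizer)")
  case _: LocalLDAModel => println("loaded a LocalLDAModel (online optimizer)")
}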
mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -64,11 +64,10 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
   */
  def testOneSample(data: RDD[Double], cdf: Double => Double): KolmogorovSmirnovTestResult = {
    val n = data.count().toDouble
    val localData = data.sortBy(x => x).mapPartitions { part =>
      val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
      searchOneSampleCandidates(partDiffs) // candidates: local extrema
    }.collect()
    val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
    val ksStat = data.sortBy(x => x).zipWithIndex().map { case (v, i) =>
      val f = cdf(v)
      math.max(f - i / n, (i + 1) / n - f)
    }.max()
    evalOneSampleP(ksStat, n.toLong)
  }

@@ -84,74 +83,6 @@
    testOneSample(data, cdf)
  }

  /**
   * Calculate unadjusted distances between the empirical CDF and the theoretical CDF in a
   * partition
   * @param partData `Iterator[Double]` 1 partition of a sorted RDD
   * @param n `Double` the total size of the RDD
   * @param cdf `Double => Double` a function that calculates the theoretical CDF of a value
   * @return `Iterator[(Double, Double)]` Unadjusted (i.e. off by a constant) potential extrema
   *         in a partition. The first element corresponds to the (empirical CDF - 1/N) - CDF,
   *         the second element corresponds to empirical CDF - CDF. We can then search the resulting
   *         iterator for the minimum of the first and the maximum of the second element, and provide
   *         this as a partition's candidate extrema
   */
  private def oneSampleDifferences(partData: Iterator[Double], n: Double, cdf: Double => Double)
    : Iterator[(Double, Double)] = {
    // zip data with index (within that partition)
    // calculate local (unadjusted) empirical CDF and subtract CDF
    partData.zipWithIndex.map { case (v, ix) =>
      // dp and dl are later adjusted by constant, when global info is available
      val dp = (ix + 1) / n
      val dl = ix / n
      val cdfVal = cdf(v)
      (dl - cdfVal, dp - cdfVal)
    }
  }

  /**
   * Search the unadjusted differences in a partition and return the
   * two extrema (furthest below and furthest above CDF), along with a count of elements in that
   * partition
   * @param partDiffs `Iterator[(Double, Double)]` the unadjusted differences between empirical CDF
   *                  and CDF in a partition, which come as a tuple of
   *                  (empirical CDF - 1/N - CDF, empirical CDF - CDF)
   * @return `Iterator[(Double, Double, Double)]` the local extrema and a count of elements
   */
  private def searchOneSampleCandidates(partDiffs: Iterator[(Double, Double)])
    : Iterator[(Double, Double, Double)] = {
    val initAcc = (Double.MaxValue, Double.MinValue, 0.0)
    val pResults = partDiffs.foldLeft(initAcc) { case ((pMin, pMax, pCt), (dl, dp)) =>
      (math.min(pMin, dl), math.max(pMax, dp), pCt + 1)
    }
    val results = if (pResults == initAcc) Array[(Double, Double, Double)]() else Array(pResults)
    results.iterator
  }

  /**
   * Find the global maximum distance between empirical CDF and CDF (i.e. the KS statistic) after
   * adjusting local extrema estimates from individual partitions with the number of elements in
   * preceding partitions
   * @param localData `Array[(Double, Double, Double)]` A local array containing the collected
   *                  results of `searchOneSampleCandidates` across all partitions
   * @param n `Double` The size of the RDD
   * @return The one-sample Kolmogorov-Smirnov statistic
   */
  private def searchOneSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
    : Double = {
    val initAcc = (Double.MinValue, 0.0)
    // adjust differences based on the number of elements preceding it, which should provide
    // the correct distance between empirical CDF and CDF
    val results = localData.foldLeft(initAcc) { case ((prevMax, prevCt), (minCand, maxCand, ct)) =>
      val adjConst = prevCt / n
      val dist1 = math.abs(minCand + adjConst)
      val dist2 = math.abs(maxCand + adjConst)
      val maxVal = Array(prevMax, dist1, dist2).max
      (maxVal, prevCt + ct)
    }
    results._1
  }

  /**
   * A convenience function that allows running the KS test for 1 set of sample data against
   * a named distribution
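The rewritten testOneSample computes the one-sample Kolmogorov-Smirnov statistic in a single pass over the globally sorted sample instead of combining per-partition extrema: at the i-th smallest value (0-based) the empirical CDF steps from i/n to (i+1)/n, so the largest deviation from the theoretical CDF at that point is max(F(x) - i/n, (i+1)/n - F(x)), and the statistic is the maximum of these over all i. A small local sketch of the same computation on an in-memory sample (plain Scala, illustration only; ksStatistic and uniformCdf are made-up names, not part of the diff):

// Sketch only: the KS statistic for an in-memory sample, mirroring the new testOneSample.
def ksStatistic(sample: Seq[Double], cdf: Double => Double): Double = {
  val n = sample.size.toDouble
  sample.sorted.zipWithIndex.map { case (v, i) =>
    val f = cdf(v)
    // deviations just below and just above the empirical CDF's step at v
    math.max(f - i / n, (i + 1) / n - f)
  }.max
}

// Example against the CDF of Uniform(0, 1).
val uniformCdf = (x: Double) => math.min(1.0, math.max(0.0, x))
val d = ksStatistic(Seq(0.1, 0.2, 0.35, 0.6, 0.9), uniformCdf)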
23 changes: 23 additions & 0 deletions mllib/src/test/scala/org/apache/spark/ml/clustering/LDASuite.scala
@@ -17,12 +17,17 @@

package org.apache.spark.ml.clustering

import java.io.File

import org.apache.spark.SparkFunSuite
import org.apache.spark.graphx.Edge
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LocalLDAModel}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.util.Utils


object LDASuite {
@@ -261,4 +266,22 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest
    testEstimatorAndModelReadWrite(lda, dataset,
      LDASuite.allParamSettings ++ Map("optimizer" -> "em"), checkModelData)
  }

test("load LDAModel") {
val lda = new LDA().setK(k).setSeed(1).setOptimizer("em").setMaxIter(2)
val distributedModel = lda.fit(dataset)
val localModel = lda.setOptimizer("online").fit(dataset)

val tempDir1 = Utils.createTempDir()
val distributedPath = new File(tempDir1, "distributed").getPath
val localPath = new File(tempDir1, "local").getPath
try {
distributedModel.save(distributedPath)
localModel.save(localPath)
assert(LDAModel.load(distributedPath).isInstanceOf[DistributedLDAModel])
assert(LDAModel.load(localPath).isInstanceOf[LocalLDAModel])
} finally {
Utils.deleteRecursively(tempDir1)
}
}
}