From 9d7b00b08423dae455e57e1f00fa6471f8fd901c Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 22 Dec 2014 02:47:54 +0100 Subject: [PATCH 01/14] Initial commit for Kernels feature: Contains class heirarchies, implementation of the Nystrom method for feature map extractions, RBF and Polynomial Kernels. Also a bare bones test suite for SVM Kernels is included --- .../spark/mllib/kernels/DensityKernel.scala | 45 +++++ .../apache/spark/mllib/kernels/Kernel.scala | 40 ++++ .../mllib/kernels/PolynomialKernel.scala | 47 +++++ .../spark/mllib/kernels/RBFKernel.scala | 48 +++++ .../spark/mllib/kernels/SVMKernel.scala | 180 ++++++++++++++++++ .../spark/mllib/kernels/KernelSuite.scala | 84 ++++++++ 6 files changed, 444 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala new file mode 100644 index 0000000000000..abab0778f588f --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.rdd.RDD + +/** + * Abstract class which can be extended to + * implement various Multivariate Density + * Kernels. 
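+ *
+ * A density kernel is treated here as a function K(.) of a single vector
+ * argument, so the two-argument evaluate(x, y) below reduces to K(x - y).
+ * Concrete implementations supply the one-argument eval; the bandwidth
+ * vector and its AMISE-optimal selection are handled by the members below.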
+ */ +abstract class DensityKernel extends Kernel with Serializable{ + protected var bandwidth: Vector + + protected def setBandwidth(b: Vector): Unit = { + this.bandwidth = b + } + + protected def eval(x: Vector):Double + + override def evaluate(x: Vector, y: Vector): Double = + this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + + /** + * Calculates the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + protected def optimalBandwidth(data: RDD[Vector]): Vector + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala new file mode 100644 index 0000000000000..4730bf5dc5854 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg._ +/** + * Declares a trait Kernel which would serve + * as a base trait for all classes implementing + * Machine Learning Kernels. + * + **/ + +trait Kernel { + + /** + * Evaluates the value of the kernel given two + * vectorial parameters + * + * @param x a local Vector. + * @param y a local Vector. + * + * @return the value of the Kernel function. + * + * */ + def evaluate(x: Vector, y:Vector): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala new file mode 100644 index 0000000000000..cc449477ca173 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard Polynomial SVM Kernel + * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r + */ +class PolynomialKernel(private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setDegree(d: Int): Unit = { + this.degree = d + } + + def setOffset(o: Int): Unit = { + this.offset = o + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = + Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala new file mode 100644 index 0000000000000..e6ff82033f493 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector, norm} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard RBF Kernel of the form + * K(Xi,Xj) = exp(-||Xi - Xj||**2/2*bandwidth**2) + */ + +class RBFKernel(private var bandwidth: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setBandwidth(d: Double): Unit = { + this.bandwidth = d + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { + val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) + } + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala new file mode 100644 index 0000000000000..c5b70fc35ef37 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.mllib.linalg +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD +/** + * Defines an abstract class outlines the basic + * functionality requirements of an SVM Kernel + */ +abstract class SVMKernel[T] extends Kernel with Logging with Serializable { + + def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): KernelMatrix[T] + +} + +/** + * Defines a global singleton object + * [[SVMKernel]] which has useful functions + * while working with [[RDD]] of [[LabeledPoint]] + * + * */ +object SVMKernel extends Logging with Serializable{ + + /** + * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] + * + * @param data : An [[RDD]] of [[LabeledPoint]] + * + * @return An (Int, LabeledPoint) Key-Value RDD indexed + * from 0 to data.count() - 1 + * */ + def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + val sc = data.context + val i = sc.accumulator(-1, "Raw Data Index") + + data.map((point) => { + i+=1 + (i.localValue, point) + }) + } + + + /** + * This function constructs an [[SVMKernelMatrix]] + * + * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] + * @param length Length of the indexed [[RDD]] + * @param eval A function which calculates the value of the Kernel + * given two Vectors [[linalg.Vector]]. + * + * @return An [[SVMKernelMatrix]] object. + * + * */ + def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + length: Long, + eval: (linalg.Vector, linalg.Vector) => Double): + KernelMatrix[RDD[((Int, Int), Double)]] = { + + logInfo("Constructing key-value representation of kernel matrix.") + logInfo("Dimension: " + length + " x " + length) + + val labels = mappedData.map((p) => (p._1, p._2.label)) + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + eval(prod._1._2.features, prod._2._2.features))) + kernel.cache() + new SVMKernelMatrix(kernel, length, labels) + } + +} + +/** + * Defines a trait which outlines the basic + * functionality of Kernel Matrices. 
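+ *
+ * The type parameter T is the backing representation of the Gram matrix
+ * (an [[RDD]] of ((row, column), value) entries in the case of
+ * [[SVMKernelMatrix]]); getKernelMatrix exposes it and buildFeatureMap
+ * returns a low-rank, Nystrom-style feature map of the requested dimension.
+ *
+ * A hypothetical usage sketch, mirroring the accompanying KernelSuite test
+ * (the data set name and the dimension count are illustrative only):
+ *
+ * {{{
+ *   val indexed  = SVMKernel.indexedRDD(data)   // data: RDD[LabeledPoint]
+ *   val rbf      = new RBFKernel(1.0)
+ *   val matrix   = rbf.buildKernelMatrixasRDD(indexed, data.count())
+ *   val features = matrix.buildFeatureMap(10)   // RDD[LabeledPoint]
+ * }}}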
+ * */ +trait KernelMatrix[T] extends Serializable{ + protected val kernel: T + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] + def getKernelMatrix(): T = this.kernel +} + +class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], + private val dimension: Long, + private val labels: RDD[(Int, Double)]) + extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + + override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + val multiplyKernelMatrixOn = + (v :breeze.linalg.DenseVector[Double]) => { + val vbr = kernel.context.broadcast(v) + v.mapPairs((i, _) => { + //Get row number i of kernel + val row = kernel.filter((point) => i == point._1._1) + //multiply with v + var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") + row.foreach((rownum) => { + sum += rownum._2*vbr.value(rownum._1._2) + }) + sum.value + }) + } + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * @param dimensions The effective number of dimensions + * to be calculated in the feature map + * + * @return An RDD containing the non linear feature map + * of all the data points passed to the function. + * + * */ + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { + + + logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") + val decomposition = EigenValueDecomposition + .symmetricEigs( + multiplyKernelMatrixOn, + dimension.toInt, dimensions, + 0.0001, 300) + + logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") + + //TODO: Comment here + + val rows = kernel.groupBy((couple) => { + couple._1._1 + }) + + val temp = labels.join(rows) + + temp.map((datapoint) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + val eigenvector = decomposition._2(::, i) + val eigenvalue = decomposition._1(i) + var acc = 0.0 + datapoint._2._2.foreach((p) => + acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + ) + acc + } + new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) + }) + + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala new file mode 100644 index 0000000000000..8be61ee158f73 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.FunSuite + +class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ + + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData) + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(2, 1.5) + + val mappedData = SVMKernel.indexedRDD(testRDD) + + val kernelMatrix1 = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrix2 = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + assert(kernelMatrix1.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + assert(kernelMatrix2.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + + } + + test("Testing building of feature map from the kernel matrix"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + testRDD.cache() + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(5, 1.5) + val mappedData = SVMKernel.indexedRDD(testRDD) + + mappedData.cache() + val kernelMatrixpoly = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) + val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + + assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + + } +} From 2e29b123665b19f3f06d5f4f90d7884646da71a6 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Wed, 21 Jan 2015 18:34:29 +0100 Subject: [PATCH 02/14] 1) Changes to class hierarchy 2) Commenting Nystrom approximation code --- .../spark/mllib/kernels/DensityKernel.scala | 9 +- .../mllib/kernels/GaussianDensityKernel.scala | 82 +++++++++++++++++++ .../spark/mllib/kernels/RBFKernel.scala | 2 +- .../spark/mllib/kernels/SVMKernel.scala | 19 ++++- 4 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index abab0778f588f..baec10ceceeaa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -24,14 +24,9 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -abstract class DensityKernel extends Kernel with Serializable{ - protected var bandwidth: Vector +trait DensityKernel extends Kernel with Serializable{ - protected def setBandwidth(b: Vector): Unit = { - this.bandwidth = b - } - - protected def eval(x: Vector):Double + def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala new file mode 100644 index 0000000000000..e133ed69d78bd --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.kernels + +import breeze.linalg.{norm, DenseVector} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + + +class GaussianDensityKernel(protected var bandwidth: Vector) + extends DensityKernel with Logging { + + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b + } + + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + private def evalWithBandwidth(x: Vector, b: Vector): Double = { + val exp = scala.math.exp _ + val pow = scala.math.pow _ _ + val sqrt = scala.math.sqrt _ + val Pi = scala.math.Pi + + val buff = x.toBreeze + + val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( + b.size)( + (i) => buff(i)/b.apply(i) + ) + exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + } + + //TODO: Implement derivative function + private def derivative(n: Int)(x: Vector): Vector = { + Vectors.zeros(x.size) + } + + //TODO: Implement R integral + private def R(r: Int, pilot: Vector): Vector = { + Vectors.zeros(pilot.size) + } + + //TODO: Implement mu integral + private val mu: Vector = Vectors.zeros(this.bandwidth.size) + + override def optimalBandwidth(data: RDD[Vector]): Unit = { + + //First calculate variance of all dimensions + val columnStats = Statistics.colStats(data) + + val colvariance = columnStats.variance + + //Now calculate the initial estimates of R(f'''') and R(f'''''') + + //Use the earlier result to calculate h1 and h2 bandwidths for each + //dimension separately + + //Use the Sheathon and Jones 1991 result to calculate + //the optimal bandwidth + + //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index 
e6ff82033f493..fac11439fc192 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index c5b70fc35ef37..ff4d0d054e6bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -155,14 +155,29 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - //TODO: Comment here - + /* + * Get row number i of the + * Kernel Matrix + * */ val rows = kernel.groupBy((couple) => { couple._1._1 }) + /* + * Join the each row i with the + * target label for point i. + * */ val temp = labels.join(rows) + /* + * Now for each data point, + * calculate n dimensions of the + * feature map where n is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) + * */ temp.map((datapoint) => { val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => val eigenvector = decomposition._2(::, i) From 5239082426911032df7228163a94eca8281e5676 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Tue, 27 Jan 2015 17:03:26 +0100 Subject: [PATCH 03/14] Added optimal bandwidth selection procedure for Gaussian Density Kernel --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 181 +++++++++++++++--- .../spark/mllib/kernels/KernelEstimator.scala | 41 ++++ .../mllib/kernels/PolynomialKernel.scala | 6 +- .../spark/mllib/kernels/RBFKernel.scala | 6 +- .../spark/mllib/kernels/SVMKernel.scala | 25 ++- .../spark/mllib/kernels/KernelSuite.scala | 18 ++ 7 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index baec10ceceeaa..6658c5343ace3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -24,17 +25,15 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -trait DensityKernel extends Kernel with Serializable{ +trait DensityKernel extends Kernel with Serializable { def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) - /** - * Calculates the AMISE (Asymptotic Mean Integrated Square Error) - * optimal bandwidth assignment by 'solve the equation plug in method' - **/ - protected def optimalBandwidth(data: RDD[Vector]): Vector + protected def derivative(n: Int, x: Double): Double + protected val mu: Double + protected val r: Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index e133ed69d78bd..6c7621e11a208 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -23,60 +23,179 @@ import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD +import breeze.numerics.{sqrt => brsqrt} -class GaussianDensityKernel(protected var bandwidth: Vector) - extends DensityKernel with Logging { +class GaussianDensityKernel + extends DensityKernel with KernelEstimator with Logging with Serializable { + private val exp = scala.math.exp _ + private val pow = scala.math.pow _ + private val sqrt = scala.math.sqrt _ + private val Pi = scala.math.Pi + protected var bandwidth: Vector = Vectors.zeros(10) - def setBandwidth(b: linalg.Vector): Unit = { - this.bandwidth = b - } - - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + private def evalForDimension(x: Double, pilot: Double): Double = + exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) private def evalWithBandwidth(x: Vector, b: Vector): Double = { - val exp = scala.math.exp _ - val pow = scala.math.pow _ _ - val sqrt = scala.math.sqrt _ - val Pi = scala.math.Pi - + assert(x.size == b.size, + "Dimensions of vector x and the bandwidth of the kernel must match") val buff = x.toBreeze - + val bw = b.toBreeze val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( - b.size)( - (i) => buff(i)/b.apply(i) + bw.size)( + (i) => buff(i)/bw(i) ) - exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + exp(-1*pow(norm(normalizedbuff), 2)/2)/pow(sqrt(Pi * 2), b.size) } - //TODO: Implement derivative function - private def derivative(n: Int)(x: Vector): Vector = { - Vectors.zeros(x.size) + /* + * Calculate the value of the hermite polynomials + * tail recursively. This is needed to calculate + * the Gaussian derivatives at a point x. + * */ + private def hermite(n: Int, x: Double): Double = { + def hermiteHelper(k: Int, x: Double, a: Double, b: Double): Double = + k match { + case 0 => a + case 1 => b + case _ => hermiteHelper(k-1, x, b, x*b - (k-1)*a) + } + hermiteHelper(n, x, 1, x) } - //TODO: Implement R integral - private def R(r: Int, pilot: Vector): Vector = { - Vectors.zeros(pilot.size) + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b } - //TODO: Implement mu integral - private val mu: Vector = Vectors.zeros(this.bandwidth.size) + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + /** + * Calculates the derivative at point x for the Gaussian + * Density Kernel, for only one dimension. 
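+   *
+   * This uses the identity d^n/dx^n phi(x) = (-1)^n * He_n(x) * phi(x),
+   * where phi is the standard normal density and He_n is the nth
+   * (probabilists') Hermite polynomial produced by the tail-recursive
+   * helper above.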
+ * + * @param n The number of times the gaussian has to be differentiated + * @param x The point x at which the derivative has to evaluated + * @return The value of the nth derivative at the point x + * */ + override def derivative(n: Int, x: Double): Double = { + (1/sqrt(2*Pi))*(1/pow(-1.0,n))*exp(-1*pow(x,2)/2)*hermite(n, x) + } + + /** + * Implementation of the estimator for the R integral + * for a multivariate Gaussian Density Kernel. + * Evaluates R(D_r(f(x))). + * + * @param r the degree of the derivative of the kernel + * + * @param N The size of the original data set from which + * kernel matrix [[RDD]] was constructed. + * + * @param pilot The pilot bandwidth to be used to calculate + * the kernel values. (Note that we have not calculated + * the AMISE bandwidth yet and we use this estimator + * as a means to get the AMISE bandwidth) + * + * @param kernel The RDD containing the kernel matrix + * consisting of pairs Xi - Xj, where Xi and Xj + * are drawn from the original data set. + * + * @return R the estimated value of the integral of the square + * of the rth derivative of the kernel over the Real domain. + * */ + override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { + + + /* + * Apply map to get values of the derivative of the kernel + * at various point pairs. + * */ + val kernelNormalized = kernel.map((couple) => + (couple._1, Vectors.fromBreeze(DenseVector.tabulate(pilot.size) + ((i) => (1/(pow(N, 2)*pow(pilot(i), r + 1)))* + this.derivative(r, couple._2.toBreeze(i)/pilot(i))) + ))) + + /* + * Sum up all the individual values to get the estimated + * value of the integral + * */ + val integralvalue = kernelNormalized.reduce((a,b) => + ((0,0), Vectors.fromBreeze(a._2.toBreeze + b._2.toBreeze))) + + integralvalue._2.toBreeze + } + + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) + + /** + * Use the Sheather and Jones plug-in + * method to calculate the optimal bandwidth + * http://bit.ly/1EoBY7q + * + * */ override def optimalBandwidth(data: RDD[Vector]): Unit = { + val dataSize: Long = data.count() //First calculate variance of all dimensions val columnStats = Statistics.colStats(data) + // And then the standard deviation + val colvar = columnStats.variance.toBreeze + val colstd = colvar.map((v) => sqrt(v)) + + //Now calculate the initial estimates of R(f^6) and R(f^8) + + /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/ + + val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi))) + + /* + * Use the earlier result to calculate + * h1 and h2 bandwidths for each dimension + * */ + + /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/ + val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9)) + + + /* + * Use h1 and h2 to calculate more + * refined estimates of R(f^6) and R(f^8) + * */ + + //Get an 0-indexed version of the original data set + val mappedData = SVMKernel.indexedRDD(data) + + /* + * Apply cartesian product on the indexed data set + * and then map it to a RDD of type [(i,j), Xi - Xj] + * */ + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + Vectors.fromBreeze(prod._1._2.toBreeze - + 
prod._2._2.toBreeze)) + ) + kernel.cache() + - val colvariance = columnStats.variance + val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) - //Now calculate the initial estimates of R(f'''') and R(f'''''') + val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow((-2*this.derivative(4, 0.0))/(dataSize*this.mu*newRf6(i)), 1/7)) - //Use the earlier result to calculate h1 and h2 bandwidths for each - //dimension separately + val newRf4: breeze.linalg.Vector[Double] = this.R(4, dataSize, hAMSE, kernel) - //Use the Sheathon and Jones 1991 result to calculate - //the optimal bandwidth + val hAMISE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(this.r/(dataSize*this.mu*this.mu*newRf4(i)), 1/5)) - //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + this.bandwidth = Vectors.fromBreeze(hAMISE) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala new file mode 100644 index 0000000000000..dedbd4c3a6264 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Trait defining the basic behavior + * of a Kernel density estimator + */ +trait KernelEstimator extends Logging { + + protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] + + + /** + * Calculate the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + def optimalBandwidth(data: RDD[Vector]): Unit + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index cc449477ca173..51abfad97c060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD */ class PolynomialKernel(private var degree: Int, private var offset: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -40,8 +40,8 @@ class PolynomialKernel(private var degree: Int, override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index fac11439fc192..d5c9285e8c394 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d @@ -40,9 +40,9 @@ class RBFKernel(private var bandwidth: Double) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index ff4d0d054e6bb..74bec1050f913 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,8 +16,7 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector} -import 
org.apache.spark.annotation.DeveloperApi +import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg import org.apache.spark.{SparkContext, Logging} import org.apache.spark.mllib.linalg._ @@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): KernelMatrix[T] } @@ -50,9 +49,9 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { val sc = data.context - val i = sc.accumulator(-1, "Raw Data Index") + val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") data.map((point) => { i+=1 @@ -72,10 +71,10 @@ object SVMKernel extends Logging with Serializable{ * @return An [[SVMKernelMatrix]] object. * * */ - def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], length: Long, eval: (linalg.Vector, linalg.Vector) => Double): - KernelMatrix[RDD[((Int, Int), Double)]] = { + KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") logInfo("Dimension: " + length + " x " + length) @@ -100,12 +99,12 @@ trait KernelMatrix[T] extends Serializable{ def getKernelMatrix(): T = this.kernel } -class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], +class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], private val dimension: Long, - private val labels: RDD[(Int, Double)]) - extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { - override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel /** * Defines a function value which @@ -123,7 +122,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], //multiply with v var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2) + sum += rownum._2*vbr.value(rownum._1._2.toInt) }) sum.value }) @@ -184,7 +183,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], val eigenvalue = decomposition._1(i) var acc = 0.0 datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) ) acc } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 8be61ee158f73..6ac6ae9b33e18 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -81,4 +81,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) } + + test("Testing optimal bandwidth calculation on Gaussian Kernel"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A 
= 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + val newtestRDD = testRDD.map((p) => p.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + assert(kern.eval(newtestRDD.first()) != Double.NaN) + } } From bf1e9829b798ca76d408f6d1a66220720948c038 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Thu, 29 Jan 2015 14:42:53 +0100 Subject: [PATCH 04/14] Entropy based subset selection done, unit tests passing --- .../spark/mllib/kernels/SVMKernel.scala | 20 ++- .../mllib/prototype/EntropyMeasure.scala | 49 +++++++ .../mllib/prototype/EntropySelector.scala | 127 ++++++++++++++++++ .../spark/mllib/prototype/Measure.scala | 28 ++++ .../prototype/QuadraticRenyiEntropy.scala | 60 +++++++++ .../mllib/prototype/SubsetSelector.scala | 28 ++++ .../spark/mllib/kernels/KernelSuite.scala | 20 ++- 7 files changed, 319 insertions(+), 13 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 74bec1050f913..5321e55a07c70 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -39,7 +39,7 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * while working with [[RDD]] of [[LabeledPoint]] * * */ -object SVMKernel extends Logging with Serializable{ +object SVMKernel extends Logging with Serializable { /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] @@ -49,15 +49,7 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { - val sc = data.context - val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") - - data.map((point) => { - i+=1 - (i.localValue, point) - }) - } + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) /** @@ -87,13 +79,19 @@ object SVMKernel extends Logging with Serializable{ new SVMKernelMatrix(kernel, length, labels) } + def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): + RDD[LabeledPoint] = mappedData.join(labels).map((point) => + new LabeledPoint(point._2._2, point._2._1)) + + } /** * Defines a trait which outlines the basic * functionality of Kernel Matrices. 
* */ -trait KernelMatrix[T] extends Serializable{ +trait KernelMatrix[T] extends Serializable { protected val kernel: T def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] def getKernelMatrix(): T = this.kernel diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala new file mode 100644 index 0000000000000..73bcfa3aab30e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.Vector + +/** + * Models a general entropy measure. + * Any entropy measure would require a + * probability distribution + */ +abstract class EntropyMeasure extends Measure[Vector] +with Serializable{ + + protected val density: DensityKernel + + /** + * Given a probability distribution for + * the data set, calculate the entropy of + * the data set with respect to the given + * distribution. + * + * @param data The data set whose entropy is + * required. + * + * @return The entropy of the data set. + * */ + + def entropy[K](data: RDD[(K, Vector)]): Double + + override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala new file mode 100644 index 0000000000000..1543919c1fe53 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD + +/** + * Basic skeleton of an entropy based + * subset selector + */ +abstract class EntropySelector + extends SubsetSelector[(Long, Vector)] with Serializable + with Logging { + protected val measure: EntropyMeasure + protected val delta: Double + protected val MAX_ITERATIONS: Int +} + +class GreedyEntropySelector(m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector with Serializable + with Logging { + + override protected val measure: EntropyMeasure = m + override protected val delta: Double = del + override protected val MAX_ITERATIONS: Int = max + + override def selectPrototypes(data: RDD[(Long, Vector)], + M: Int): RDD[(Long, Vector)] = { + + val context = data.context + + /* + * Draw an initial sample of M points + * from data without replacement. + * + * Define a working set which we + * will use as a prototype set to + * to each iteration + * */ + + val workingset = data.keys.takeSample(false, M) + + val r = scala.util.Random + var it: Int = 0 + + //All the elements not in the working set + var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p)) + //Existing best value of the entropy + var oldEntropy: Double = this.measure.evaluate(data.filter((point) => + workingset.contains(point._1))) + //Store the value of entropy after an element swap + var newEntropy: Double = 0.0 + var d: Double = Double.NegativeInfinity + var rand: Int = 0 + do { + /* + * Randomly select a point from + * the working set as well as data + * and then swap them. + * */ + rand = r.nextInt(workingset.length - 1) + val point1 = workingset.apply(rand) + + val point2 = newDataset.takeSample(false, 1).apply(0) + + //Update the working set + workingset(rand) = point2 + //Calculate the new entropy + newEntropy = this.measure.evaluate(data.filter((p) => + workingset.contains(p._1))) + + /* + * Calculate the change in entropy, + * if it has improved then keep the + * swap, otherwise revert to existing + * working set. + * */ + d = newEntropy - oldEntropy + + if(d < 0) { + /* + * Improvement in entropy so + * keep the updated working set + * as it is and update the + * variable 'newDataset' + * */ + oldEntropy = newEntropy + newDataset = data.keys.filter((p) => !workingset.contains(p)) + } else { + /* + * No improvement in entropy + * so revert the working set + * to its initial state. Leave + * the variable newDataset as + * it is. + * */ + workingset(rand) = point1 + } + + it += 1 + } while(math.abs(d) >= this.delta && + it <= this.MAX_ITERATIONS) + + //Time to return the final working set + data.filter((p) => workingset.contains(p._1)) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala new file mode 100644 index 0000000000000..80d466fb18ee3 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Trait which outlines basic behavior + * of a subset utility measure. + */ +trait Measure[T] { + def evaluate[K](data: RDD[(K, T)]): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala new file mode 100644 index 0000000000000..d2fcbaef381e8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.prototype + +import breeze.linalg.DenseVector +import org.apache.spark.Logging +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Vectors, Vector} + +/** + * Implements the quadratic Renyi Entropy + */ +class QuadraticRenyiEntropy(dist: DensityKernel) + extends EntropyMeasure with Serializable with Logging { + + val log_e = scala.math.log _ + val sqrt = scala.math.sqrt _ + + override protected val density: DensityKernel = dist + + /** + * Calculate the quadratic Renyi entropy + * within a distribution specific + * proportionality constant. This can + * be used to compare the entropy values of + * different sets of data on the same + * distribution. + * + * @param data The data set whose entropy is + * required. + * @return The entropy of the dataset assuming + * it is distributed as given by the value + * parameter 'density'. 
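+   *
+   * The quadratic Renyi entropy is H_2(X) = -log( integral of p(x)^2 dx );
+   * for a Gaussian density kernel the plug-in estimate reduces, up to the
+   * proportionality constant dropped here, to
+   * -log( sum over all pairs (i, j) of K((x_i - x_j) / sqrt(2)) ),
+   * which is what the pairwise cartesian computation below evaluates.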
+ * */ + + override def entropy[K](data: RDD[(K, Vector)]): Double = { + val dim = data.first()._2.size + val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) + -1*log_e(data.cartesian(data).map((couple) => + density.evaluate( + Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), + Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) + )).reduce((a,b) => a + b)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala new file mode 100644 index 0000000000000..c96bcb0dd3a3e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Defines the characteristics of + * a subset selector + */ +trait SubsetSelector[T] extends Serializable{ + def selectPrototypes(data: RDD[T], M: Int): RDD[T] +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 6ac6ae9b33e18..26f163ada25c2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,7 +16,9 @@ */ package org.apache.spark.mllib.kernels +import breeze.linalg.norm import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.FunSuite @@ -82,8 +84,9 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { } - test("Testing optimal bandwidth calculation on Gaussian Kernel"){ - val nPoints = 100 + test("Testing optimal bandwidth calculation on Gaussian Kernel" + + " and maximum entropy subset selection"){ + val nPoints = 10000 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 @@ -98,5 +101,18 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kern = new GaussianDensityKernel() kern.optimalBandwidth(newtestRDD) assert(kern.eval(newtestRDD.first()) != Double.NaN) + + val newIndexedRDD = SVMKernel.indexedRDD(newtestRDD) + newIndexedRDD.cache() + newtestRDD.unpersist() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + + val subsetRDD = subsetsel.selectPrototypes( + newIndexedRDD, + 100) + + assert(subsetRDD.count() == 100) } } From 43f85c20f1b8ca8ed7324f828df3a88295d26a3d Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 00:26:03 +0100 
Subject: [PATCH 05/14] 1) Optimized code for feature map extraction, kernel matrix multiplication 2) Code indentation changes --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 25 +-- .../apache/spark/mllib/kernels/Kernel.scala | 4 +- .../spark/mllib/kernels/KernelEstimator.scala | 9 +- .../mllib/kernels/PolynomialKernel.scala | 20 +- .../spark/mllib/kernels/RBFKernel.scala | 18 +- .../spark/mllib/kernels/SVMKernel.scala | 205 ++++++++++-------- .../mllib/prototype/EntropyMeasure.scala | 9 +- .../mllib/prototype/EntropySelector.scala | 31 +-- .../prototype/QuadraticRenyiEntropy.scala | 18 +- .../spark/mllib/kernels/KernelSuite.scala | 80 ++++++- 11 files changed, 258 insertions(+), 172 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index 6658c5343ace3..7f8b7a06af7cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD /** * Abstract class which can be extended to @@ -26,14 +25,14 @@ import org.apache.spark.rdd.RDD * Kernels. */ trait DensityKernel extends Kernel with Serializable { + protected val mu: Double + protected val r: Double def eval(x: Vector):Double - override def evaluate(x: Vector, y: Vector): Double = - this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + this.eval(Vectors.fromBreeze(x.features.toBreeze.-=(y.features.toBreeze))) protected def derivative(n: Int, x: Double): Double - protected val mu: Double - protected val r: Double -} + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index 6c7621e11a208..6de1c51c89df4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -19,20 +19,22 @@ package org.apache.spark.mllib.kernels import breeze.linalg.{norm, DenseVector} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD -import breeze.numerics.{sqrt => brsqrt} - class GaussianDensityKernel - extends DensityKernel with KernelEstimator with Logging with Serializable { + extends DensityKernel + with KernelEstimator + with Logging + with Serializable { private val exp = scala.math.exp _ private val pow = scala.math.pow _ private val sqrt = scala.math.sqrt _ private val Pi = scala.math.Pi protected var bandwidth: Vector = Vectors.zeros(10) + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) private def evalForDimension(x: Double, pilot: Double): Double = exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) @@ -64,12 +66,11 @@ class GaussianDensityKernel hermiteHelper(n, x, 1, x) } - def setBandwidth(b: linalg.Vector): Unit = { + def setBandwidth(b: Vector): Unit = { this.bandwidth = b } - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) - + override def eval(x: Vector) = 
evalWithBandwidth(x, this.bandwidth) /** * Calculates the derivative at point x for the Gaussian @@ -105,9 +106,9 @@ class GaussianDensityKernel * @return R the estimated value of the integral of the square * of the rth derivative of the kernel over the Real domain. * */ - override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { - + override protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { /* * Apply map to get values of the derivative of the kernel @@ -129,9 +130,6 @@ class GaussianDensityKernel integralvalue._2.toBreeze } - override protected val mu = (1/4)*(1/sqrt(Pi)) - override protected val r = (1/2)*(1/sqrt(Pi)) - /** * Use the Sheather and Jones plug-in * method to calculate the optimal bandwidth @@ -185,7 +183,6 @@ class GaussianDensityKernel ) kernel.cache() - val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala index 4730bf5dc5854..3d945fa6e22b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint + /** * Declares a trait Kernel which would serve * as a base trait for all classes implementing @@ -36,5 +38,5 @@ trait Kernel { * @return the value of the Kernel function. 
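For a quick sense of the updated signature, a small worked example with the two SVM kernels from this patch set; the labels are placeholders, since only the feature vectors enter the kernel value, and LabeledPoint/Vectors are assumed to be in scope:

val a = LabeledPoint(0.0, Vectors.dense(1.0, 2.0))
val b = LabeledPoint(1.0, Vectors.dense(3.0, 4.0))

new PolynomialKernel(2, 1.5).evaluate(a, b)  // (1*3 + 2*4 + 1.5)^2 = 156.25
new RBFKernel(1.0).evaluate(a, b)            // exp(-||a - b||^2 / 2) = exp(-4) ≈ 0.0183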
* * */ - def evaluate(x: Vector, y:Vector): Double + def evaluate(x: LabeledPoint, y: LabeledPoint): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala index dedbd4c3a6264..03cc504bc34c3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -18,8 +18,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** @@ -28,9 +27,9 @@ import org.apache.spark.rdd.RDD */ trait KernelEstimator extends Logging { - protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] - + protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] /** * Calculate the AMISE (Asymptotic Mean Integrated Square Error) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index 51abfad97c060..828aca0b48570 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -17,7 +17,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -25,9 +24,12 @@ import org.apache.spark.rdd.RDD * Standard Polynomial SVM Kernel * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r */ -class PolynomialKernel(private var degree: Int, - private var offset: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ +class PolynomialKernel( + private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -37,11 +39,11 @@ class PolynomialKernel(private var degree: Int, this.offset = o } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = - Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + Math.pow(x.features.toBreeze dot y.features.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index d5c9285e8c394..3b78b159d43b1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -16,10 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector, norm} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg -import 
org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -29,20 +27,22 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { - val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = { + val diff: Vector = Vectors.fromBreeze(x.features.toBreeze - y.features.toBreeze) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 5321e55a07c70..a4a11dc53e2d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,9 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.DenseVector -import org.apache.spark.mllib.linalg -import org.apache.spark.{SparkContext, Logging} +import breeze.linalg.{DenseVector, DenseMatrix} +import org.apache.spark.Logging import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -28,9 +27,67 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): KernelMatrix[T] + /** + * Build the kernel matrix of the prototype vectors + * + * @param mappedData The prototype vectors/points + * + * @param length The number of points + * + * @return A [[KernelMatrix]] object + * + * + * */ + def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[T] + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * For each data point, + * calculate m dimensions of the + * feature map where m is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, m, K(k, x)*eigenvector(i)(k)) + * + * @param decomposition The Eigenvalue decomposition calculated + * from the kernel matrix of the prototype + * subset. + * @param prototypes The prototype subset. + * + * @param data The dataset [[RDD]] on which the feature map + * is to be applied. 
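The same Nystrom construction in a self-contained, single-machine form, using Breeze's dense eigSym where the distributed code uses the ARPACK routine; `k` and `protos` are illustrative stand-ins for the kernel function and the prototype subset:

import breeze.linalg.{DenseMatrix, DenseVector, eigSym}
import breeze.linalg.eigSym.EigSym

def nystromFeatureMap(
    k: (Array[Double], Array[Double]) => Double,
    protos: Array[Array[Double]]): Array[Double] => DenseVector[Double] = {
  val m = protos.length
  // m x m Gram matrix of the prototypes and its eigendecomposition
  // (in practice only the leading `dimensions` eigenpairs are kept,
  // which is what symmetricEigs returns)
  val gram = DenseMatrix.tabulate(m, m)((i, j) => k(protos(i), protos(j)))
  val EigSym(lambda, u) = eigSym(gram)
  (x: Array[Double]) => {
    val kx = DenseVector.tabulate(m)(j => k(protos(j), x))
    // phi_i(x) = (1 / sqrt(lambda_i)) * sum_j K(z_j, x) * u(j, i)
    DenseVector.tabulate(m)(i => (u(::, i) dot kx) / math.sqrt(lambda(i)))
  }
}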
+ * + * */ + def featureMapping(decomposition: (DenseVector[Double], DenseMatrix[Double])) + (prototypes: RDD[(Long, LabeledPoint)]) + (data: RDD[(Long, LabeledPoint)]) + : RDD[(Long, LabeledPoint)] = { + + logInfo("Calculating the Non Linear feature map of data set") + + data.cartesian(prototypes) + .map((couple) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + var eigenvector = 0.0 + if (couple._2._1.toInt < decomposition._1.length) { + eigenvector = decomposition._2(couple._2._1.toInt, i) + } + val eigenvalue = decomposition._1(i) + this.evaluate(couple._1._2, couple._2._2) * eigenvector/Math.sqrt(eigenvalue) + } + (couple._1._1, (couple._1._2.label, y)) + }).reduceByKey((veca, vecb) => (veca._1, veca._2 + vecb._2)) + .map((p) => (p._1, new LabeledPoint(p._2._1, Vectors.fromBreeze(p._2._2)))) + } } /** @@ -41,6 +98,32 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * */ object SVMKernel extends Logging with Serializable { + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + def multiplyKernelMatrixBy(kernel: RDD[((Long, Long), Double)]) + (v :breeze.linalg.DenseVector[Double]): + DenseVector[Double] = { + val vbr = kernel.context.broadcast(v) + val result: DenseVector[Double] = + DenseVector.tabulate(v.length)( + (i) => { + //Get row number i of kernel + val row = DenseVector.apply(kernel + .filter((point) => i == point._1._1) + .map((p) => p._2) + .collect()) + //dot product with v + vbr.value.t * row + } + ) + result + } + /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] * @@ -49,8 +132,8 @@ object SVMKernel extends Logging with Serializable { * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) - + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = + data.zipWithIndex().map((p) => (p._2, p._1)) /** * This function constructs an [[SVMKernelMatrix]] @@ -58,14 +141,15 @@ object SVMKernel extends Logging with Serializable { * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] * @param length Length of the indexed [[RDD]] * @param eval A function which calculates the value of the Kernel - * given two Vectors [[linalg.Vector]]. + * given two Labeled Points [[LabeledPoint]]. * * @return An [[SVMKernelMatrix]] object. 
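In miniature, this is what the resulting kernel matrix RDD holds: one ((i, j), K(x_i, x_j)) entry per ordered pair of indexed points. The SparkContext `sc` and the concrete numbers below are illustrative only:

val pts = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))
val indexed = SVMKernel.indexedRDD(pts)              // RDD[(Long, LabeledPoint)]
val kmat = new RBFKernel(1.0).buildKernelMatrixasRDD(indexed, 2L)
kmat.getKernelMatrix().collect()
// entries, in no particular order:
// ((0,0), 1.0), ((0,1), exp(-4)), ((1,0), exp(-4)), ((1,1), 1.0)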
* * */ - def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], - length: Long, - eval: (linalg.Vector, linalg.Vector) => Double): + def buildSVMKernelMatrix( + mappedData: RDD[(Long, LabeledPoint)], + length: Long, + eval: (LabeledPoint, LabeledPoint) => Double): KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") @@ -74,17 +158,19 @@ object SVMKernel extends Logging with Serializable { val labels = mappedData.map((p) => (p._1, p._2.label)) val kernel = mappedData.cartesian(mappedData) .map((prod) => ((prod._1._1, prod._2._1), - eval(prod._1._2.features, prod._2._2.features))) + eval(prod._1._2, prod._2._2))) kernel.cache() new SVMKernelMatrix(kernel, length, labels) } - def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], - labels: RDD[(Long, Double)]): - RDD[LabeledPoint] = mappedData.join(labels).map((point) => + def zipVectorsWithLabels( + mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): RDD[LabeledPoint] = + mappedData.join(labels).map((point) => new LabeledPoint(point._2._2, point._2._1)) - + def unzipIndexedData(mappedData: RDD[(Long, LabeledPoint)]): + RDD[LabeledPoint] = mappedData.map((p) => p._2) } /** @@ -93,38 +179,19 @@ object SVMKernel extends Logging with Serializable { * */ trait KernelMatrix[T] extends Serializable { protected val kernel: T - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] - def getKernelMatrix(): T = this.kernel -} -class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], - private val dimension: Long, - private val labels: RDD[(Long, Double)]) - extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { + def eigenDecomposition(dimensions: Int): (DenseVector[Double], DenseMatrix[Double]) - override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel + def getKernelMatrix(): T = this.kernel +} - /** - * Defines a function value which - * calculates the multiplication of - * the Kernel Matrix with a Breeze - * Vector and returns the result as a - * Breeze DenseVector. - * */ - val multiplyKernelMatrixOn = - (v :breeze.linalg.DenseVector[Double]) => { - val vbr = kernel.context.broadcast(v) - v.mapPairs((i, _) => { - //Get row number i of kernel - val row = kernel.filter((point) => i == point._1._1) - //multiply with v - var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") - row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2.toInt) - }) - sum.value - }) - } +class SVMKernelMatrix( + override protected val kernel: RDD[((Long, Long), Double)], + private val dimension: Long, + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] + with Logging + with Serializable { /** * Builds an approximate nonlinear feature map @@ -140,53 +207,13 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)] * of all the data points passed to the function. 
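Sketch of how the pieces are meant to compose, following the pattern used in KernelSuite; `indexed` is an indexed RDD of LabeledPoint as produced by SVMKernel.indexedRDD, and the bandwidth and dimension count are illustrative:

val rbf = new RBFKernel(0.8)
val kernelMatrix = rbf.buildKernelMatrixasRDD(indexed, indexed.count())
// truncated eigendecomposition of the prototype kernel matrix ...
val decomposition = kernelMatrix.eigenDecomposition(5)
// ... plugged into the Nystrom feature map and applied to the full data set
val mapped = rbf.featureMapping(decomposition)(indexed)(indexed)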
* * */ - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { - - + override def eigenDecomposition(dimensions: Int = this.dimension.toInt): + (DenseVector[Double], DenseMatrix[Double]) = { logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") - val decomposition = EigenValueDecomposition + EigenValueDecomposition .symmetricEigs( - multiplyKernelMatrixOn, + SVMKernel.multiplyKernelMatrixBy(kernel), dimension.toInt, dimensions, 0.0001, 300) - - logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - - /* - * Get row number i of the - * Kernel Matrix - * */ - val rows = kernel.groupBy((couple) => { - couple._1._1 - }) - - /* - * Join the each row i with the - * target label for point i. - * */ - val temp = labels.join(rows) - - /* - * Now for each data point, - * calculate n dimensions of the - * feature map where n is the number - * of eigenvalues/vectors obtained from - * the Eigen Decomposition. - * - * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) - * */ - temp.map((datapoint) => { - val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => - val eigenvector = decomposition._2(::, i) - val eigenvalue = decomposition._1(i) - var acc = 0.0 - datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) - ) - acc - } - new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) - }) - } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala index 73bcfa3aab30e..78ffbda08b3d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -18,16 +18,15 @@ package org.apache.spark.mllib.prototype import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.Vector /** * Models a general entropy measure. * Any entropy measure would require a * probability distribution */ -abstract class EntropyMeasure extends Measure[Vector] -with Serializable{ +abstract class EntropyMeasure extends Measure[LabeledPoint] with Serializable { protected val density: DensityKernel @@ -43,7 +42,7 @@ with Serializable{ * @return The entropy of the data set. 
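The greedy exchange performed by GreedyEntropySelector further down, reduced to a local sketch over index sets; `entropyOf` stands in for evaluating the measure on the corresponding subset of the data, and every other name is illustrative:

def greedySelect(
    n: Int, m: Int, entropyOf: Set[Int] => Double,
    delta: Double = 0.0001, maxIterations: Int = 5000): Set[Int] = {
  val rnd = new scala.util.Random()
  // start from a random working set of m indices
  var working = rnd.shuffle((0 until n).toVector).take(m).toSet
  var old = entropyOf(working)
  var d = Double.PositiveInfinity
  var it = 0
  while (math.abs(d) >= delta && it < maxIterations) {
    // swap one random member for one random non-member
    val outside = (0 until n).filterNot(working.contains)
    val candidate =
      working - working.toVector(rnd.nextInt(m)) + outside(rnd.nextInt(outside.size))
    val fresh = entropyOf(candidate)
    d = fresh - old
    if (d > 0) { working = candidate; old = fresh }  // keep the swap only if entropy improves
    it += 1
  }
  working
}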
* */ - def entropy[K](data: RDD[(K, Vector)]): Double + def entropy[K](data: RDD[(K, LabeledPoint)]): Double - override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) + override def evaluate[K](data: RDD[(K, LabeledPoint)]): Double = this.entropy(data) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala index 1543919c1fe53..34d94544c26a4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -17,8 +17,8 @@ package org.apache.spark.mllib.prototype -import org.apache.spark.{SparkContext, Logging} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.Logging +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -26,27 +26,29 @@ import org.apache.spark.rdd.RDD * subset selector */ abstract class EntropySelector - extends SubsetSelector[(Long, Vector)] with Serializable + extends SubsetSelector[(Long, LabeledPoint)] + with Serializable with Logging { protected val measure: EntropyMeasure protected val delta: Double protected val MAX_ITERATIONS: Int } -class GreedyEntropySelector(m: EntropyMeasure, - del: Double = 0.0001, - max: Int = 5000) - extends EntropySelector with Serializable +class GreedyEntropySelector( + m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector + with Serializable with Logging { override protected val measure: EntropyMeasure = m override protected val delta: Double = del override protected val MAX_ITERATIONS: Int = max - override def selectPrototypes(data: RDD[(Long, Vector)], - M: Int): RDD[(Long, Vector)] = { - - val context = data.context + override def selectPrototypes( + data: RDD[(Long, LabeledPoint)], + M: Int): RDD[(Long, LabeledPoint)] = { /* * Draw an initial sample of M points @@ -56,7 +58,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * will use as a prototype set to * to each iteration * */ - + logInfo("Initializing the working set, by drawing randomly from the training set") val workingset = data.keys.takeSample(false, M) val r = scala.util.Random @@ -71,6 +73,7 @@ class GreedyEntropySelector(m: EntropyMeasure, var newEntropy: Double = 0.0 var d: Double = Double.NegativeInfinity var rand: Int = 0 + logInfo("Starting iterative, entropy based greedy subset selection") do { /* * Randomly select a point from @@ -96,7 +99,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * */ d = newEntropy - oldEntropy - if(d < 0) { + if(d > 0) { /* * Improvement in entropy so * keep the updated working set @@ -119,7 +122,7 @@ class GreedyEntropySelector(m: EntropyMeasure, it += 1 } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS) - + logInfo("Working set obtained, now starting process of packaging it as an RDD") //Time to return the final working set data.filter((p) => workingset.contains(p._1)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala index d2fcbaef381e8..3613dba8a723e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -19,18 +19,20 @@ package org.apache.spark.mllib.prototype import breeze.linalg.DenseVector import org.apache.spark.Logging import 
org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * Implements the quadratic Renyi Entropy */ class QuadraticRenyiEntropy(dist: DensityKernel) - extends EntropyMeasure with Serializable with Logging { + extends EntropyMeasure + with Serializable + with Logging { val log_e = scala.math.log _ val sqrt = scala.math.sqrt _ - override protected val density: DensityKernel = dist /** @@ -48,13 +50,11 @@ class QuadraticRenyiEntropy(dist: DensityKernel) * parameter 'density'. * */ - override def entropy[K](data: RDD[(K, Vector)]): Double = { - val dim = data.first()._2.size + override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { + val dim = data.first()._2.features.size val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) -1*log_e(data.cartesian(data).map((couple) => - density.evaluate( - Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), - Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) - )).reduce((a,b) => a + b)) + density.eval(Vectors.fromBreeze(couple._1._2.features.toBreeze :/ root_two - + couple._2._2.features.toBreeze :/ root_two))).reduce((a,b) => a + b)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 26f163ada25c2..c03ce34d0ce6c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.norm +import org.scalatest.FunSuite import org.apache.spark.mllib.classification.SVMSuite import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.FunSuite class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ val nPoints = 100 @@ -76,18 +76,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) assert(mappedData.count() == nPoints) - val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) - val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + val mappedFeaturespoly = poly.featureMapping( + kernelMatrixpoly.eigenDecomposition(99) + )(mappedData)(mappedData) + val mappedFeaturesrbf = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(99) + )(mappedData)(mappedData) - assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) - assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) } test("Testing optimal bandwidth calculation on Gaussian Kernel" + " and maximum entropy subset selection"){ - val nPoints = 10000 - + val nPoints = 1000 + val subsetSize = 100 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 val B = -1.5 @@ -110,9 +114,63 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( - newIndexedRDD, 
- 100) + SVMKernel.indexedRDD(testRDD), + subsetSize) + + assert(subsetRDD.count() == subsetSize) + } + + test("Testing rbf kernel with subset selection and feature map extraction") { + val nPoints = 1000 + val nDimensions = 5 + val subsetSize = 100 + val unZip = SVMKernel.unzipIndexedData _ + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput( + A, + Array[Double](B, C), + nPoints, + 42) + + val testRDD = sc.parallelize(testData, 2) + + val newtestRDD = testRDD.map(_.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + newtestRDD.unpersist() + val mappedData = SVMKernel.indexedRDD(testRDD) + mappedData.cache() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val subsetRDD = subsetsel.selectPrototypes( + mappedData, + subsetSize) + + val rbf = new RBFKernel(0.8) + subsetRDD.cache() + + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD( + SVMKernel.indexedRDD(unZip(subsetRDD)), + subsetSize) + + val featureMap = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(nDimensions) + )(subsetRDD) _ + + val mappedFeaturesrbf = featureMap(mappedData) + + mappedFeaturesrbf.cache() + mappedData.unpersist() + + assert(mappedFeaturesrbf.count() == nPoints) + assert(mappedFeaturesrbf.first()._2.features.size == nDimensions) - assert(subsetRDD.count() == 100) } } From 025d214802621e6a8f8b05732723bbd0025f136d Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 14:29:27 +0100 Subject: [PATCH 06/14] Minor edits to kernel test suite. --- .../org/apache/spark/mllib/kernels/KernelSuite.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index c03ce34d0ce6c..b45980f7bd972 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -85,7 +85,6 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) - } test("Testing optimal bandwidth calculation on Gaussian Kernel" + @@ -110,8 +109,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { newIndexedRDD.cache() newtestRDD.unpersist() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( SVMKernel.indexedRDD(testRDD), @@ -147,8 +146,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val mappedData = SVMKernel.indexedRDD(testRDD) mappedData.cache() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( mappedData, subsetSize) From 98d7730270d804e2140741ee0154cd03c2dde350 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 22 Dec 2014 
02:47:54 +0100 Subject: [PATCH 07/14] Initial commit for Kernels feature: Contains class heirarchies, implementation of the Nystrom method for feature map extractions, RBF and Polynomial Kernels. Also a bare bones test suite for SVM Kernels is included --- .../spark/mllib/kernels/DensityKernel.scala | 45 +++++ .../apache/spark/mllib/kernels/Kernel.scala | 40 ++++ .../mllib/kernels/PolynomialKernel.scala | 47 +++++ .../spark/mllib/kernels/RBFKernel.scala | 48 +++++ .../spark/mllib/kernels/SVMKernel.scala | 180 ++++++++++++++++++ .../spark/mllib/kernels/KernelSuite.scala | 84 ++++++++ 6 files changed, 444 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala new file mode 100644 index 0000000000000..abab0778f588f --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.rdd.RDD + +/** + * Abstract class which can be extended to + * implement various Multivariate Density + * Kernels. + */ +abstract class DensityKernel extends Kernel with Serializable{ + protected var bandwidth: Vector + + protected def setBandwidth(b: Vector): Unit = { + this.bandwidth = b + } + + protected def eval(x: Vector):Double + + override def evaluate(x: Vector, y: Vector): Double = + this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + + /** + * Calculates the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + protected def optimalBandwidth(data: RDD[Vector]): Vector + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala new file mode 100644 index 0000000000000..4730bf5dc5854 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg._ +/** + * Declares a trait Kernel which would serve + * as a base trait for all classes implementing + * Machine Learning Kernels. + * + **/ + +trait Kernel { + + /** + * Evaluates the value of the kernel given two + * vectorial parameters + * + * @param x a local Vector. + * @param y a local Vector. + * + * @return the value of the Kernel function. + * + * */ + def evaluate(x: Vector, y:Vector): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala new file mode 100644 index 0000000000000..cc449477ca173 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard Polynomial SVM Kernel + * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r + */ +class PolynomialKernel(private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setDegree(d: Int): Unit = { + this.degree = d + } + + def setOffset(o: Int): Unit = { + this.offset = o + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = + Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala new file mode 100644 index 0000000000000..e6ff82033f493 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector, norm} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard RBF Kernel of the form + * K(Xi,Xj) = exp(-||Xi - Xj||**2/2*bandwidth**2) + */ + +class RBFKernel(private var bandwidth: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setBandwidth(d: Double): Unit = { + this.bandwidth = d + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { + val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) + } + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala new file mode 100644 index 0000000000000..c5b70fc35ef37 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.mllib.linalg +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD +/** + * Defines an abstract class outlines the basic + * functionality requirements of an SVM Kernel + */ +abstract class SVMKernel[T] extends Kernel with Logging with Serializable { + + def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): KernelMatrix[T] + +} + +/** + * Defines a global singleton object + * [[SVMKernel]] which has useful functions + * while working with [[RDD]] of [[LabeledPoint]] + * + * */ +object SVMKernel extends Logging with Serializable{ + + /** + * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] + * + * @param data : An [[RDD]] of [[LabeledPoint]] + * + * @return An (Int, LabeledPoint) Key-Value RDD indexed + * from 0 to data.count() - 1 + * */ + def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + val sc = data.context + val i = sc.accumulator(-1, "Raw Data Index") + + data.map((point) => { + i+=1 + (i.localValue, point) + }) + } + + + /** + * This function constructs an [[SVMKernelMatrix]] + * + * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] + * @param length Length of the indexed [[RDD]] + * @param eval A function which calculates the value of the Kernel + * given two Vectors [[linalg.Vector]]. + * + * @return An [[SVMKernelMatrix]] object. + * + * */ + def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + length: Long, + eval: (linalg.Vector, linalg.Vector) => Double): + KernelMatrix[RDD[((Int, Int), Double)]] = { + + logInfo("Constructing key-value representation of kernel matrix.") + logInfo("Dimension: " + length + " x " + length) + + val labels = mappedData.map((p) => (p._1, p._2.label)) + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + eval(prod._1._2.features, prod._2._2.features))) + kernel.cache() + new SVMKernelMatrix(kernel, length, labels) + } + +} + +/** + * Defines a trait which outlines the basic + * functionality of Kernel Matrices. 
+ * */ +trait KernelMatrix[T] extends Serializable{ + protected val kernel: T + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] + def getKernelMatrix(): T = this.kernel +} + +class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], + private val dimension: Long, + private val labels: RDD[(Int, Double)]) + extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + + override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + val multiplyKernelMatrixOn = + (v :breeze.linalg.DenseVector[Double]) => { + val vbr = kernel.context.broadcast(v) + v.mapPairs((i, _) => { + //Get row number i of kernel + val row = kernel.filter((point) => i == point._1._1) + //multiply with v + var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") + row.foreach((rownum) => { + sum += rownum._2*vbr.value(rownum._1._2) + }) + sum.value + }) + } + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * @param dimensions The effective number of dimensions + * to be calculated in the feature map + * + * @return An RDD containing the non linear feature map + * of all the data points passed to the function. + * + * */ + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { + + + logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") + val decomposition = EigenValueDecomposition + .symmetricEigs( + multiplyKernelMatrixOn, + dimension.toInt, dimensions, + 0.0001, 300) + + logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") + + //TODO: Comment here + + val rows = kernel.groupBy((couple) => { + couple._1._1 + }) + + val temp = labels.join(rows) + + temp.map((datapoint) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + val eigenvector = decomposition._2(::, i) + val eigenvalue = decomposition._1(i) + var acc = 0.0 + datapoint._2._2.foreach((p) => + acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + ) + acc + } + new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) + }) + + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala new file mode 100644 index 0000000000000..8be61ee158f73 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.FunSuite + +class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ + + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData) + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(2, 1.5) + + val mappedData = SVMKernel.indexedRDD(testRDD) + + val kernelMatrix1 = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrix2 = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + assert(kernelMatrix1.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + assert(kernelMatrix2.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + + } + + test("Testing building of feature map from the kernel matrix"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + testRDD.cache() + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(5, 1.5) + val mappedData = SVMKernel.indexedRDD(testRDD) + + mappedData.cache() + val kernelMatrixpoly = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) + val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + + assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + + } +} From bc0209b1b1d0e72e1335bb349ec04dba2a306864 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Wed, 21 Jan 2015 18:34:29 +0100 Subject: [PATCH 08/14] 1) Changes to class hierarchy 2) Commenting Nystrom approximation code --- .../spark/mllib/kernels/DensityKernel.scala | 9 +- .../mllib/kernels/GaussianDensityKernel.scala | 82 +++++++++++++++++++ .../spark/mllib/kernels/RBFKernel.scala | 2 +- .../spark/mllib/kernels/SVMKernel.scala | 19 ++++- 4 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index abab0778f588f..baec10ceceeaa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -24,14 +24,9 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -abstract class DensityKernel extends Kernel with Serializable{ - protected var bandwidth: Vector +trait DensityKernel extends Kernel with Serializable{ - protected def setBandwidth(b: Vector): Unit = { - this.bandwidth = b - } - - protected def eval(x: Vector):Double + def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala new file mode 100644 index 0000000000000..e133ed69d78bd --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.kernels + +import breeze.linalg.{norm, DenseVector} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + + +class GaussianDensityKernel(protected var bandwidth: Vector) + extends DensityKernel with Logging { + + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b + } + + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + private def evalWithBandwidth(x: Vector, b: Vector): Double = { + val exp = scala.math.exp _ + val pow = scala.math.pow _ _ + val sqrt = scala.math.sqrt _ + val Pi = scala.math.Pi + + val buff = x.toBreeze + + val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( + b.size)( + (i) => buff(i)/b.apply(i) + ) + exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + } + + //TODO: Implement derivative function + private def derivative(n: Int)(x: Vector): Vector = { + Vectors.zeros(x.size) + } + + //TODO: Implement R integral + private def R(r: Int, pilot: Vector): Vector = { + Vectors.zeros(pilot.size) + } + + //TODO: Implement mu integral + private val mu: Vector = Vectors.zeros(this.bandwidth.size) + + override def optimalBandwidth(data: RDD[Vector]): Unit = { + + //First calculate variance of all dimensions + val columnStats = Statistics.colStats(data) + + val colvariance = columnStats.variance + + //Now calculate the initial estimates of R(f'''') and R(f'''''') + + //Use the earlier result to calculate h1 and h2 bandwidths for each + //dimension separately + + //Use the Sheathon and Jones 1991 result to calculate + //the optimal bandwidth + + //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index 
e6ff82033f493..fac11439fc192 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index c5b70fc35ef37..ff4d0d054e6bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -155,14 +155,29 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - //TODO: Comment here - + /* + * Get row number i of the + * Kernel Matrix + * */ val rows = kernel.groupBy((couple) => { couple._1._1 }) + /* + * Join the each row i with the + * target label for point i. + * */ val temp = labels.join(rows) + /* + * Now for each data point, + * calculate n dimensions of the + * feature map where n is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) + * */ temp.map((datapoint) => { val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => val eigenvector = decomposition._2(::, i) From f8baf1e7da0ab6d71898f150d6b01cf3f38b2baf Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Tue, 27 Jan 2015 17:03:26 +0100 Subject: [PATCH 09/14] Added optimal bandwidth selection procedure for Gaussian Density Kernel --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 181 +++++++++++++++--- .../spark/mllib/kernels/KernelEstimator.scala | 41 ++++ .../mllib/kernels/PolynomialKernel.scala | 6 +- .../spark/mllib/kernels/RBFKernel.scala | 6 +- .../spark/mllib/kernels/SVMKernel.scala | 25 ++- .../spark/mllib/kernels/KernelSuite.scala | 18 ++ 7 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index baec10ceceeaa..6658c5343ace3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -24,17 +25,15 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -trait DensityKernel extends Kernel with Serializable{ +trait DensityKernel extends Kernel with Serializable { def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) - /** - * Calculates the AMISE (Asymptotic Mean Integrated Square Error) - * optimal bandwidth assignment by 'solve the equation plug in method' - **/ - protected def optimalBandwidth(data: RDD[Vector]): Vector + protected def derivative(n: Int, x: Double): Double + protected val mu: Double + protected val r: Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index e133ed69d78bd..6c7621e11a208 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -23,60 +23,179 @@ import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD +import breeze.numerics.{sqrt => brsqrt} -class GaussianDensityKernel(protected var bandwidth: Vector) - extends DensityKernel with Logging { +class GaussianDensityKernel + extends DensityKernel with KernelEstimator with Logging with Serializable { + private val exp = scala.math.exp _ + private val pow = scala.math.pow _ + private val sqrt = scala.math.sqrt _ + private val Pi = scala.math.Pi + protected var bandwidth: Vector = Vectors.zeros(10) - def setBandwidth(b: linalg.Vector): Unit = { - this.bandwidth = b - } - - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + private def evalForDimension(x: Double, pilot: Double): Double = + exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) private def evalWithBandwidth(x: Vector, b: Vector): Double = { - val exp = scala.math.exp _ - val pow = scala.math.pow _ _ - val sqrt = scala.math.sqrt _ - val Pi = scala.math.Pi - + assert(x.size == b.size, + "Dimensions of vector x and the bandwidth of the kernel must match") val buff = x.toBreeze - + val bw = b.toBreeze val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( - b.size)( - (i) => buff(i)/b.apply(i) + bw.size)( + (i) => buff(i)/bw(i) ) - exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + exp(-1*pow(norm(normalizedbuff), 2)/2)/pow(sqrt(Pi * 2), b.size) } - //TODO: Implement derivative function - private def derivative(n: Int)(x: Vector): Vector = { - Vectors.zeros(x.size) + /* + * Calculate the value of the hermite polynomials + * tail recursively. This is needed to calculate + * the Gaussian derivatives at a point x. + * */ + private def hermite(n: Int, x: Double): Double = { + def hermiteHelper(k: Int, x: Double, a: Double, b: Double): Double = + k match { + case 0 => a + case 1 => b + case _ => hermiteHelper(k-1, x, b, x*b - (k-1)*a) + } + hermiteHelper(n, x, 1, x) } - //TODO: Implement R integral - private def R(r: Int, pilot: Vector): Vector = { - Vectors.zeros(pilot.size) + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b } - //TODO: Implement mu integral - private val mu: Vector = Vectors.zeros(this.bandwidth.size) + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + /** + * Calculates the derivative at point x for the Gaussian + * Density Kernel, for only one dimension. 
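The derivative described here relies on the identity that the nth derivative of the standard Gaussian density phi(x) equals (-1)^n * He_n(x) * phi(x), with He_n the probabilists' Hermite polynomial. A standalone sketch using the forward form of the same recursion:

// He_0 = 1, He_1 = x, He_k = x * He_{k-1} - (k - 1) * He_{k-2}
def hermiteForward(n: Int, x: Double): Double =
  if (n == 0) 1.0
  else {
    var prev = 1.0
    var cur = x
    for (k <- 2 to n) {
      val next = x * cur - (k - 1) * prev
      prev = cur
      cur = next
    }
    cur
  }

// d^n/dx^n [ exp(-x^2/2) / sqrt(2*Pi) ] = (-1)^n * He_n(x) * phi(x)
def gaussianDerivative(n: Int, x: Double): Double =
  math.pow(-1.0, n) * hermiteForward(n, x) * math.exp(-x * x / 2) / math.sqrt(2 * math.Pi)

// e.g. gaussianDerivative(2, 0.0) = (0 - 1) * phi(0) ≈ -0.3989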
+ * + * @param n The number of times the gaussian has to be differentiated + * @param x The point x at which the derivative has to evaluated + * @return The value of the nth derivative at the point x + * */ + override def derivative(n: Int, x: Double): Double = { + (1/sqrt(2*Pi))*(1/pow(-1.0,n))*exp(-1*pow(x,2)/2)*hermite(n, x) + } + + /** + * Implementation of the estimator for the R integral + * for a multivariate Gaussian Density Kernel. + * Evaluates R(D_r(f(x))). + * + * @param r the degree of the derivative of the kernel + * + * @param N The size of the original data set from which + * kernel matrix [[RDD]] was constructed. + * + * @param pilot The pilot bandwidth to be used to calculate + * the kernel values. (Note that we have not calculated + * the AMISE bandwidth yet and we use this estimator + * as a means to get the AMISE bandwidth) + * + * @param kernel The RDD containing the kernel matrix + * consisting of pairs Xi - Xj, where Xi and Xj + * are drawn from the original data set. + * + * @return R the estimated value of the integral of the square + * of the rth derivative of the kernel over the Real domain. + * */ + override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { + + + /* + * Apply map to get values of the derivative of the kernel + * at various point pairs. + * */ + val kernelNormalized = kernel.map((couple) => + (couple._1, Vectors.fromBreeze(DenseVector.tabulate(pilot.size) + ((i) => (1/(pow(N, 2)*pow(pilot(i), r + 1)))* + this.derivative(r, couple._2.toBreeze(i)/pilot(i))) + ))) + + /* + * Sum up all the individual values to get the estimated + * value of the integral + * */ + val integralvalue = kernelNormalized.reduce((a,b) => + ((0,0), Vectors.fromBreeze(a._2.toBreeze + b._2.toBreeze))) + + integralvalue._2.toBreeze + } + + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) + + /** + * Use the Sheather and Jones plug-in + * method to calculate the optimal bandwidth + * http://bit.ly/1EoBY7q + * + * */ override def optimalBandwidth(data: RDD[Vector]): Unit = { + val dataSize: Long = data.count() //First calculate variance of all dimensions val columnStats = Statistics.colStats(data) + // And then the standard deviation + val colvar = columnStats.variance.toBreeze + val colstd = colvar.map((v) => sqrt(v)) + + //Now calculate the initial estimates of R(f^6) and R(f^8) + + /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/ + + val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi))) + + /* + * Use the earlier result to calculate + * h1 and h2 bandwidths for each dimension + * */ + + /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/ + val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9)) + + + /* + * Use h1 and h2 to calculate more + * refined estimates of R(f^6) and R(f^8) + * */ + + //Get an 0-indexed version of the original data set + val mappedData = SVMKernel.indexedRDD(data) + + /* + * Apply cartesian product on the indexed data set + * and then map it to a RDD of type [(i,j), Xi - Xj] + * */ + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + Vectors.fromBreeze(prod._1._2.toBreeze - + 
prod._2._2.toBreeze)) + ) + kernel.cache() + - val colvariance = columnStats.variance + val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) - //Now calculate the initial estimates of R(f'''') and R(f'''''') + val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow((-2*this.derivative(4, 0.0))/(dataSize*this.mu*newRf6(i)), 1/7)) - //Use the earlier result to calculate h1 and h2 bandwidths for each - //dimension separately + val newRf4: breeze.linalg.Vector[Double] = this.R(4, dataSize, hAMSE, kernel) - //Use the Sheathon and Jones 1991 result to calculate - //the optimal bandwidth + val hAMISE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(this.r/(dataSize*this.mu*this.mu*newRf4(i)), 1/5)) - //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + this.bandwidth = Vectors.fromBreeze(hAMISE) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala new file mode 100644 index 0000000000000..dedbd4c3a6264 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Trait defining the basic behavior + * of a Kernel density estimator + */ +trait KernelEstimator extends Logging { + + protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] + + + /** + * Calculate the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + def optimalBandwidth(data: RDD[Vector]): Unit + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index cc449477ca173..51abfad97c060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD */ class PolynomialKernel(private var degree: Int, private var offset: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -40,8 +40,8 @@ class PolynomialKernel(private var degree: Int, override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index fac11439fc192..d5c9285e8c394 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d @@ -40,9 +40,9 @@ class RBFKernel(private var bandwidth: Double) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index ff4d0d054e6bb..74bec1050f913 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,8 +16,7 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector} -import 
org.apache.spark.annotation.DeveloperApi +import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg import org.apache.spark.{SparkContext, Logging} import org.apache.spark.mllib.linalg._ @@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): KernelMatrix[T] } @@ -50,9 +49,9 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { val sc = data.context - val i = sc.accumulator(-1, "Raw Data Index") + val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") data.map((point) => { i+=1 @@ -72,10 +71,10 @@ object SVMKernel extends Logging with Serializable{ * @return An [[SVMKernelMatrix]] object. * * */ - def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], length: Long, eval: (linalg.Vector, linalg.Vector) => Double): - KernelMatrix[RDD[((Int, Int), Double)]] = { + KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") logInfo("Dimension: " + length + " x " + length) @@ -100,12 +99,12 @@ trait KernelMatrix[T] extends Serializable{ def getKernelMatrix(): T = this.kernel } -class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], +class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], private val dimension: Long, - private val labels: RDD[(Int, Double)]) - extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { - override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel /** * Defines a function value which @@ -123,7 +122,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], //multiply with v var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2) + sum += rownum._2*vbr.value(rownum._1._2.toInt) }) sum.value }) @@ -184,7 +183,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], val eigenvalue = decomposition._1(i) var acc = 0.0 datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) ) acc } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 8be61ee158f73..6ac6ae9b33e18 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -81,4 +81,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) } + + test("Testing optimal bandwidth calculation on Gaussian Kernel"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A 
= 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + val newtestRDD = testRDD.map((p) => p.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + assert(kern.eval(newtestRDD.first()) != Double.NaN) + } } From 1fedafdbe7794dcd9a34f6a7317b1ff5e936c1f1 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Thu, 29 Jan 2015 14:42:53 +0100 Subject: [PATCH 10/14] Entropy based subset selection done, unit tests passing --- .../spark/mllib/kernels/SVMKernel.scala | 20 ++- .../mllib/prototype/EntropyMeasure.scala | 49 +++++++ .../mllib/prototype/EntropySelector.scala | 127 ++++++++++++++++++ .../spark/mllib/prototype/Measure.scala | 28 ++++ .../prototype/QuadraticRenyiEntropy.scala | 60 +++++++++ .../mllib/prototype/SubsetSelector.scala | 28 ++++ .../spark/mllib/kernels/KernelSuite.scala | 20 ++- 7 files changed, 319 insertions(+), 13 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 74bec1050f913..5321e55a07c70 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -39,7 +39,7 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * while working with [[RDD]] of [[LabeledPoint]] * * */ -object SVMKernel extends Logging with Serializable{ +object SVMKernel extends Logging with Serializable { /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] @@ -49,15 +49,7 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { - val sc = data.context - val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") - - data.map((point) => { - i+=1 - (i.localValue, point) - }) - } + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) /** @@ -87,13 +79,19 @@ object SVMKernel extends Logging with Serializable{ new SVMKernelMatrix(kernel, length, labels) } + def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): + RDD[LabeledPoint] = mappedData.join(labels).map((point) => + new LabeledPoint(point._2._2, point._2._1)) + + } /** * Defines a trait which outlines the basic * functionality of Kernel Matrices. 
* */ -trait KernelMatrix[T] extends Serializable{ +trait KernelMatrix[T] extends Serializable { protected val kernel: T def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] def getKernelMatrix(): T = this.kernel diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala new file mode 100644 index 0000000000000..73bcfa3aab30e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.Vector + +/** + * Models a general entropy measure. + * Any entropy measure would require a + * probability distribution + */ +abstract class EntropyMeasure extends Measure[Vector] +with Serializable{ + + protected val density: DensityKernel + + /** + * Given a probability distribution for + * the data set, calculate the entropy of + * the data set with respect to the given + * distribution. + * + * @param data The data set whose entropy is + * required. + * + * @return The entropy of the data set. + * */ + + def entropy[K](data: RDD[(K, Vector)]): Double + + override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala new file mode 100644 index 0000000000000..1543919c1fe53 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD + +/** + * Basic skeleton of an entropy based + * subset selector + */ +abstract class EntropySelector + extends SubsetSelector[(Long, Vector)] with Serializable + with Logging { + protected val measure: EntropyMeasure + protected val delta: Double + protected val MAX_ITERATIONS: Int +} + +class GreedyEntropySelector(m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector with Serializable + with Logging { + + override protected val measure: EntropyMeasure = m + override protected val delta: Double = del + override protected val MAX_ITERATIONS: Int = max + + override def selectPrototypes(data: RDD[(Long, Vector)], + M: Int): RDD[(Long, Vector)] = { + + val context = data.context + + /* + * Draw an initial sample of M points + * from data without replacement. + * + * Define a working set which we + * will use as a prototype set to + * to each iteration + * */ + + val workingset = data.keys.takeSample(false, M) + + val r = scala.util.Random + var it: Int = 0 + + //All the elements not in the working set + var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p)) + //Existing best value of the entropy + var oldEntropy: Double = this.measure.evaluate(data.filter((point) => + workingset.contains(point._1))) + //Store the value of entropy after an element swap + var newEntropy: Double = 0.0 + var d: Double = Double.NegativeInfinity + var rand: Int = 0 + do { + /* + * Randomly select a point from + * the working set as well as data + * and then swap them. + * */ + rand = r.nextInt(workingset.length - 1) + val point1 = workingset.apply(rand) + + val point2 = newDataset.takeSample(false, 1).apply(0) + + //Update the working set + workingset(rand) = point2 + //Calculate the new entropy + newEntropy = this.measure.evaluate(data.filter((p) => + workingset.contains(p._1))) + + /* + * Calculate the change in entropy, + * if it has improved then keep the + * swap, otherwise revert to existing + * working set. + * */ + d = newEntropy - oldEntropy + + if(d < 0) { + /* + * Improvement in entropy so + * keep the updated working set + * as it is and update the + * variable 'newDataset' + * */ + oldEntropy = newEntropy + newDataset = data.keys.filter((p) => !workingset.contains(p)) + } else { + /* + * No improvement in entropy + * so revert the working set + * to its initial state. Leave + * the variable newDataset as + * it is. + * */ + workingset(rand) = point1 + } + + it += 1 + } while(math.abs(d) >= this.delta && + it <= this.MAX_ITERATIONS) + + //Time to return the final working set + data.filter((p) => workingset.contains(p._1)) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala new file mode 100644 index 0000000000000..80d466fb18ee3 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Trait which outlines basic behavior + * of a subset utility measure. + */ +trait Measure[T] { + def evaluate[K](data: RDD[(K, T)]): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala new file mode 100644 index 0000000000000..d2fcbaef381e8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.prototype + +import breeze.linalg.DenseVector +import org.apache.spark.Logging +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Vectors, Vector} + +/** + * Implements the quadratic Renyi Entropy + */ +class QuadraticRenyiEntropy(dist: DensityKernel) + extends EntropyMeasure with Serializable with Logging { + + val log_e = scala.math.log _ + val sqrt = scala.math.sqrt _ + + override protected val density: DensityKernel = dist + + /** + * Calculate the quadratic Renyi entropy + * within a distribution specific + * proportionality constant. This can + * be used to compare the entropy values of + * different sets of data on the same + * distribution. + * + * @param data The data set whose entropy is + * required. + * @return The entropy of the dataset assuming + * it is distributed as given by the value + * parameter 'density'. 
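+ *
+ * Concretely, the quadratic Renyi entropy is
+ * H2(X) = -log( Integral( p(x)^2 dx ) ); with a Gaussian kernel
+ * density estimate this integral reduces, up to the proportionality
+ * constant mentioned above, to the pairwise sum
+ * Sum over all pairs (i, j) of K( (X_i - X_j) / sqrt(2) ),
+ * which is the quantity computed below.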
+ * */ + + override def entropy[K](data: RDD[(K, Vector)]): Double = { + val dim = data.first()._2.size + val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) + -1*log_e(data.cartesian(data).map((couple) => + density.evaluate( + Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), + Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) + )).reduce((a,b) => a + b)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala new file mode 100644 index 0000000000000..c96bcb0dd3a3e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Defines the characteristics of + * a subset selector + */ +trait SubsetSelector[T] extends Serializable{ + def selectPrototypes(data: RDD[T], M: Int): RDD[T] +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 6ac6ae9b33e18..26f163ada25c2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,7 +16,9 @@ */ package org.apache.spark.mllib.kernels +import breeze.linalg.norm import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.FunSuite @@ -82,8 +84,9 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { } - test("Testing optimal bandwidth calculation on Gaussian Kernel"){ - val nPoints = 100 + test("Testing optimal bandwidth calculation on Gaussian Kernel" + + " and maximum entropy subset selection"){ + val nPoints = 10000 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 @@ -98,5 +101,18 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kern = new GaussianDensityKernel() kern.optimalBandwidth(newtestRDD) assert(kern.eval(newtestRDD.first()) != Double.NaN) + + val newIndexedRDD = SVMKernel.indexedRDD(newtestRDD) + newIndexedRDD.cache() + newtestRDD.unpersist() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + + val subsetRDD = subsetsel.selectPrototypes( + newIndexedRDD, + 100) + + assert(subsetRDD.count() == 100) } } From 6b8d8db8466b8795f8710d8861fc8f1ed541543f Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 00:26:03 +0100 
Subject: [PATCH 11/14] 1) Optimized code for feature map extraction, kernel matrix multiplication 2) Code indentation changes --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 25 +-- .../apache/spark/mllib/kernels/Kernel.scala | 4 +- .../spark/mllib/kernels/KernelEstimator.scala | 9 +- .../mllib/kernels/PolynomialKernel.scala | 20 +- .../spark/mllib/kernels/RBFKernel.scala | 18 +- .../spark/mllib/kernels/SVMKernel.scala | 205 ++++++++++-------- .../mllib/prototype/EntropyMeasure.scala | 9 +- .../mllib/prototype/EntropySelector.scala | 31 +-- .../prototype/QuadraticRenyiEntropy.scala | 18 +- .../spark/mllib/kernels/KernelSuite.scala | 80 ++++++- 11 files changed, 258 insertions(+), 172 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index 6658c5343ace3..7f8b7a06af7cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD /** * Abstract class which can be extended to @@ -26,14 +25,14 @@ import org.apache.spark.rdd.RDD * Kernels. */ trait DensityKernel extends Kernel with Serializable { + protected val mu: Double + protected val r: Double def eval(x: Vector):Double - override def evaluate(x: Vector, y: Vector): Double = - this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + this.eval(Vectors.fromBreeze(x.features.toBreeze.-=(y.features.toBreeze))) protected def derivative(n: Int, x: Double): Double - protected val mu: Double - protected val r: Double -} + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index 6c7621e11a208..6de1c51c89df4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -19,20 +19,22 @@ package org.apache.spark.mllib.kernels import breeze.linalg.{norm, DenseVector} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD -import breeze.numerics.{sqrt => brsqrt} - class GaussianDensityKernel - extends DensityKernel with KernelEstimator with Logging with Serializable { + extends DensityKernel + with KernelEstimator + with Logging + with Serializable { private val exp = scala.math.exp _ private val pow = scala.math.pow _ private val sqrt = scala.math.sqrt _ private val Pi = scala.math.Pi protected var bandwidth: Vector = Vectors.zeros(10) + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) private def evalForDimension(x: Double, pilot: Double): Double = exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) @@ -64,12 +66,11 @@ class GaussianDensityKernel hermiteHelper(n, x, 1, x) } - def setBandwidth(b: linalg.Vector): Unit = { + def setBandwidth(b: Vector): Unit = { this.bandwidth = b } - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) - + override def eval(x: Vector) = 
evalWithBandwidth(x, this.bandwidth) /** * Calculates the derivative at point x for the Gaussian @@ -105,9 +106,9 @@ class GaussianDensityKernel * @return R the estimated value of the integral of the square * of the rth derivative of the kernel over the Real domain. * */ - override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { - + override protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { /* * Apply map to get values of the derivative of the kernel @@ -129,9 +130,6 @@ class GaussianDensityKernel integralvalue._2.toBreeze } - override protected val mu = (1/4)*(1/sqrt(Pi)) - override protected val r = (1/2)*(1/sqrt(Pi)) - /** * Use the Sheather and Jones plug-in * method to calculate the optimal bandwidth @@ -185,7 +183,6 @@ class GaussianDensityKernel ) kernel.cache() - val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala index 4730bf5dc5854..3d945fa6e22b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint + /** * Declares a trait Kernel which would serve * as a base trait for all classes implementing @@ -36,5 +38,5 @@ trait Kernel { * @return the value of the Kernel function. 
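 *
 * A minimal implementation (hypothetical, shown only for illustration)
 * is a linear kernel:
 * {{{
 *   object LinearKernel extends Kernel {
 *     override def evaluate(x: LabeledPoint, y: LabeledPoint): Double =
 *       x.features.toBreeze dot y.features.toBreeze
 *   }
 * }}}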
* * */ - def evaluate(x: Vector, y:Vector): Double + def evaluate(x: LabeledPoint, y: LabeledPoint): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala index dedbd4c3a6264..03cc504bc34c3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -18,8 +18,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** @@ -28,9 +27,9 @@ import org.apache.spark.rdd.RDD */ trait KernelEstimator extends Logging { - protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] - + protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] /** * Calculate the AMISE (Asymptotic Mean Integrated Square Error) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index 51abfad97c060..828aca0b48570 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -17,7 +17,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -25,9 +24,12 @@ import org.apache.spark.rdd.RDD * Standard Polynomial SVM Kernel * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r */ -class PolynomialKernel(private var degree: Int, - private var offset: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ +class PolynomialKernel( + private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -37,11 +39,11 @@ class PolynomialKernel(private var degree: Int, this.offset = o } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = - Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + Math.pow(x.features.toBreeze dot y.features.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index d5c9285e8c394..3b78b159d43b1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -16,10 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector, norm} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg -import 
org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -29,20 +27,22 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { - val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = { + val diff: Vector = Vectors.fromBreeze(x.features.toBreeze - y.features.toBreeze) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 5321e55a07c70..a4a11dc53e2d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,9 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.DenseVector -import org.apache.spark.mllib.linalg -import org.apache.spark.{SparkContext, Logging} +import breeze.linalg.{DenseVector, DenseMatrix} +import org.apache.spark.Logging import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -28,9 +27,67 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): KernelMatrix[T] + /** + * Build the kernel matrix of the prototype vectors + * + * @param mappedData The prototype vectors/points + * + * @param length The number of points + * + * @return A [[KernelMatrix]] object + * + * + * */ + def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[T] + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * For each data point, + * calculate m dimensions of the + * feature map where m is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, m, K(k, x)*eigenvector(i)(k)) + * + * @param decomposition The Eigenvalue decomposition calculated + * from the kernel matrix of the prototype + * subset. + * @param prototypes The prototype subset. + * + * @param data The dataset [[RDD]] on which the feature map + * is to be applied. 
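+ *
+ * A typical invocation, mirroring the pattern used in KernelSuite, is:
+ * {{{
+ *   val featureMap = kernel.featureMapping(
+ *     kernelMatrix.eigenDecomposition(nDimensions))(prototypes) _
+ *   val mappedData = featureMap(indexedData)
+ * }}}
+ * where kernel, kernelMatrix, prototypes and indexedData stand for a
+ * concrete SVMKernel, its KernelMatrix, the prototype subset and the
+ * full indexed data set respectively.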
+ * + * */ + def featureMapping(decomposition: (DenseVector[Double], DenseMatrix[Double])) + (prototypes: RDD[(Long, LabeledPoint)]) + (data: RDD[(Long, LabeledPoint)]) + : RDD[(Long, LabeledPoint)] = { + + logInfo("Calculating the Non Linear feature map of data set") + + data.cartesian(prototypes) + .map((couple) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + var eigenvector = 0.0 + if (couple._2._1.toInt < decomposition._1.length) { + eigenvector = decomposition._2(couple._2._1.toInt, i) + } + val eigenvalue = decomposition._1(i) + this.evaluate(couple._1._2, couple._2._2) * eigenvector/Math.sqrt(eigenvalue) + } + (couple._1._1, (couple._1._2.label, y)) + }).reduceByKey((veca, vecb) => (veca._1, veca._2 + vecb._2)) + .map((p) => (p._1, new LabeledPoint(p._2._1, Vectors.fromBreeze(p._2._2)))) + } } /** @@ -41,6 +98,32 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * */ object SVMKernel extends Logging with Serializable { + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + def multiplyKernelMatrixBy(kernel: RDD[((Long, Long), Double)]) + (v :breeze.linalg.DenseVector[Double]): + DenseVector[Double] = { + val vbr = kernel.context.broadcast(v) + val result: DenseVector[Double] = + DenseVector.tabulate(v.length)( + (i) => { + //Get row number i of kernel + val row = DenseVector.apply(kernel + .filter((point) => i == point._1._1) + .map((p) => p._2) + .collect()) + //dot product with v + vbr.value.t * row + } + ) + result + } + /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] * @@ -49,8 +132,8 @@ object SVMKernel extends Logging with Serializable { * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) - + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = + data.zipWithIndex().map((p) => (p._2, p._1)) /** * This function constructs an [[SVMKernelMatrix]] @@ -58,14 +141,15 @@ object SVMKernel extends Logging with Serializable { * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] * @param length Length of the indexed [[RDD]] * @param eval A function which calculates the value of the Kernel - * given two Vectors [[linalg.Vector]]. + * given two Labeled Points [[LabeledPoint]]. * * @return An [[SVMKernelMatrix]] object. 
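 *
 * A hypothetical call, following the usage in KernelSuite (labeledData
 * stands for any RDD of LabeledPoint), is:
 * {{{
 *   val indexed = SVMKernel.indexedRDD(labeledData)
 *   val kMatrix = SVMKernel.buildSVMKernelMatrix(
 *     indexed, indexed.count(), new RBFKernel(0.8).evaluate)
 * }}}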
* * */ - def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], - length: Long, - eval: (linalg.Vector, linalg.Vector) => Double): + def buildSVMKernelMatrix( + mappedData: RDD[(Long, LabeledPoint)], + length: Long, + eval: (LabeledPoint, LabeledPoint) => Double): KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") @@ -74,17 +158,19 @@ object SVMKernel extends Logging with Serializable { val labels = mappedData.map((p) => (p._1, p._2.label)) val kernel = mappedData.cartesian(mappedData) .map((prod) => ((prod._1._1, prod._2._1), - eval(prod._1._2.features, prod._2._2.features))) + eval(prod._1._2, prod._2._2))) kernel.cache() new SVMKernelMatrix(kernel, length, labels) } - def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], - labels: RDD[(Long, Double)]): - RDD[LabeledPoint] = mappedData.join(labels).map((point) => + def zipVectorsWithLabels( + mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): RDD[LabeledPoint] = + mappedData.join(labels).map((point) => new LabeledPoint(point._2._2, point._2._1)) - + def unzipIndexedData(mappedData: RDD[(Long, LabeledPoint)]): + RDD[LabeledPoint] = mappedData.map((p) => p._2) } /** @@ -93,38 +179,19 @@ object SVMKernel extends Logging with Serializable { * */ trait KernelMatrix[T] extends Serializable { protected val kernel: T - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] - def getKernelMatrix(): T = this.kernel -} -class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], - private val dimension: Long, - private val labels: RDD[(Long, Double)]) - extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { + def eigenDecomposition(dimensions: Int): (DenseVector[Double], DenseMatrix[Double]) - override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel + def getKernelMatrix(): T = this.kernel +} - /** - * Defines a function value which - * calculates the multiplication of - * the Kernel Matrix with a Breeze - * Vector and returns the result as a - * Breeze DenseVector. - * */ - val multiplyKernelMatrixOn = - (v :breeze.linalg.DenseVector[Double]) => { - val vbr = kernel.context.broadcast(v) - v.mapPairs((i, _) => { - //Get row number i of kernel - val row = kernel.filter((point) => i == point._1._1) - //multiply with v - var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") - row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2.toInt) - }) - sum.value - }) - } +class SVMKernelMatrix( + override protected val kernel: RDD[((Long, Long), Double)], + private val dimension: Long, + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] + with Logging + with Serializable { /** * Builds an approximate nonlinear feature map @@ -140,53 +207,13 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)] * of all the data points passed to the function. 
* * */ - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { - - + override def eigenDecomposition(dimensions: Int = this.dimension.toInt): + (DenseVector[Double], DenseMatrix[Double]) = { logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") - val decomposition = EigenValueDecomposition + EigenValueDecomposition .symmetricEigs( - multiplyKernelMatrixOn, + SVMKernel.multiplyKernelMatrixBy(kernel), dimension.toInt, dimensions, 0.0001, 300) - - logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - - /* - * Get row number i of the - * Kernel Matrix - * */ - val rows = kernel.groupBy((couple) => { - couple._1._1 - }) - - /* - * Join the each row i with the - * target label for point i. - * */ - val temp = labels.join(rows) - - /* - * Now for each data point, - * calculate n dimensions of the - * feature map where n is the number - * of eigenvalues/vectors obtained from - * the Eigen Decomposition. - * - * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) - * */ - temp.map((datapoint) => { - val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => - val eigenvector = decomposition._2(::, i) - val eigenvalue = decomposition._1(i) - var acc = 0.0 - datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) - ) - acc - } - new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) - }) - } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala index 73bcfa3aab30e..78ffbda08b3d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -18,16 +18,15 @@ package org.apache.spark.mllib.prototype import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.Vector /** * Models a general entropy measure. * Any entropy measure would require a * probability distribution */ -abstract class EntropyMeasure extends Measure[Vector] -with Serializable{ +abstract class EntropyMeasure extends Measure[LabeledPoint] with Serializable { protected val density: DensityKernel @@ -43,7 +42,7 @@ with Serializable{ * @return The entropy of the data set. 
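 *
 * Concrete implementations such as [[QuadraticRenyiEntropy]] estimate
 * this quantity from pairwise evaluations of the supplied density kernel.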
* */ - def entropy[K](data: RDD[(K, Vector)]): Double + def entropy[K](data: RDD[(K, LabeledPoint)]): Double - override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) + override def evaluate[K](data: RDD[(K, LabeledPoint)]): Double = this.entropy(data) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala index 1543919c1fe53..34d94544c26a4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -17,8 +17,8 @@ package org.apache.spark.mllib.prototype -import org.apache.spark.{SparkContext, Logging} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.Logging +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -26,27 +26,29 @@ import org.apache.spark.rdd.RDD * subset selector */ abstract class EntropySelector - extends SubsetSelector[(Long, Vector)] with Serializable + extends SubsetSelector[(Long, LabeledPoint)] + with Serializable with Logging { protected val measure: EntropyMeasure protected val delta: Double protected val MAX_ITERATIONS: Int } -class GreedyEntropySelector(m: EntropyMeasure, - del: Double = 0.0001, - max: Int = 5000) - extends EntropySelector with Serializable +class GreedyEntropySelector( + m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector + with Serializable with Logging { override protected val measure: EntropyMeasure = m override protected val delta: Double = del override protected val MAX_ITERATIONS: Int = max - override def selectPrototypes(data: RDD[(Long, Vector)], - M: Int): RDD[(Long, Vector)] = { - - val context = data.context + override def selectPrototypes( + data: RDD[(Long, LabeledPoint)], + M: Int): RDD[(Long, LabeledPoint)] = { /* * Draw an initial sample of M points @@ -56,7 +58,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * will use as a prototype set to * to each iteration * */ - + logInfo("Initializing the working set, by drawing randomly from the training set") val workingset = data.keys.takeSample(false, M) val r = scala.util.Random @@ -71,6 +73,7 @@ class GreedyEntropySelector(m: EntropyMeasure, var newEntropy: Double = 0.0 var d: Double = Double.NegativeInfinity var rand: Int = 0 + logInfo("Starting iterative, entropy based greedy subset selection") do { /* * Randomly select a point from @@ -96,7 +99,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * */ d = newEntropy - oldEntropy - if(d < 0) { + if(d > 0) { /* * Improvement in entropy so * keep the updated working set @@ -119,7 +122,7 @@ class GreedyEntropySelector(m: EntropyMeasure, it += 1 } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS) - + logInfo("Working set obtained, now starting process of packaging it as an RDD") //Time to return the final working set data.filter((p) => workingset.contains(p._1)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala index d2fcbaef381e8..3613dba8a723e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -19,18 +19,20 @@ package org.apache.spark.mllib.prototype import breeze.linalg.DenseVector import org.apache.spark.Logging import 
org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * Implements the quadratic Renyi Entropy */ class QuadraticRenyiEntropy(dist: DensityKernel) - extends EntropyMeasure with Serializable with Logging { + extends EntropyMeasure + with Serializable + with Logging { val log_e = scala.math.log _ val sqrt = scala.math.sqrt _ - override protected val density: DensityKernel = dist /** @@ -48,13 +50,11 @@ class QuadraticRenyiEntropy(dist: DensityKernel) * parameter 'density'. * */ - override def entropy[K](data: RDD[(K, Vector)]): Double = { - val dim = data.first()._2.size + override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { + val dim = data.first()._2.features.size val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) -1*log_e(data.cartesian(data).map((couple) => - density.evaluate( - Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), - Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) - )).reduce((a,b) => a + b)) + density.eval(Vectors.fromBreeze(couple._1._2.features.toBreeze :/ root_two - + couple._2._2.features.toBreeze :/ root_two))).reduce((a,b) => a + b)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 26f163ada25c2..c03ce34d0ce6c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.norm +import org.scalatest.FunSuite import org.apache.spark.mllib.classification.SVMSuite import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.FunSuite class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ val nPoints = 100 @@ -76,18 +76,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) assert(mappedData.count() == nPoints) - val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) - val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + val mappedFeaturespoly = poly.featureMapping( + kernelMatrixpoly.eigenDecomposition(99) + )(mappedData)(mappedData) + val mappedFeaturesrbf = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(99) + )(mappedData)(mappedData) - assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) - assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) } test("Testing optimal bandwidth calculation on Gaussian Kernel" + " and maximum entropy subset selection"){ - val nPoints = 10000 - + val nPoints = 1000 + val subsetSize = 100 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 val B = -1.5 @@ -110,9 +114,63 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( - newIndexedRDD, 
- 100) + SVMKernel.indexedRDD(testRDD), + subsetSize) + + assert(subsetRDD.count() == subsetSize) + } + + test("Testing rbf kernel with subset selection and feature map extraction") { + val nPoints = 1000 + val nDimensions = 5 + val subsetSize = 100 + val unZip = SVMKernel.unzipIndexedData _ + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput( + A, + Array[Double](B, C), + nPoints, + 42) + + val testRDD = sc.parallelize(testData, 2) + + val newtestRDD = testRDD.map(_.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + newtestRDD.unpersist() + val mappedData = SVMKernel.indexedRDD(testRDD) + mappedData.cache() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val subsetRDD = subsetsel.selectPrototypes( + mappedData, + subsetSize) + + val rbf = new RBFKernel(0.8) + subsetRDD.cache() + + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD( + SVMKernel.indexedRDD(unZip(subsetRDD)), + subsetSize) + + val featureMap = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(nDimensions) + )(subsetRDD) _ + + val mappedFeaturesrbf = featureMap(mappedData) + + mappedFeaturesrbf.cache() + mappedData.unpersist() + + assert(mappedFeaturesrbf.count() == nPoints) + assert(mappedFeaturesrbf.first()._2.features.size == nDimensions) - assert(subsetRDD.count() == 100) } } From 9367dc14f672976781a9ea6bdf54d503104e6d2a Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 14:29:27 +0100 Subject: [PATCH 12/14] Minor edits to kernel test suite. --- .../org/apache/spark/mllib/kernels/KernelSuite.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index c03ce34d0ce6c..b45980f7bd972 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -85,7 +85,6 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) - } test("Testing optimal bandwidth calculation on Gaussian Kernel" + @@ -110,8 +109,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { newIndexedRDD.cache() newtestRDD.unpersist() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( SVMKernel.indexedRDD(testRDD), @@ -147,8 +146,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val mappedData = SVMKernel.indexedRDD(testRDD) mappedData.cache() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( mappedData, subsetSize) From 7f4dfae134b915e9c882700c1df695dbaaaee489 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 13 Apr 2015 
From 7f4dfae134b915e9c882700c1df695dbaaaee489 Mon Sep 17 00:00:00 2001
From: mandar2812
Date: Mon, 13 Apr 2015 16:54:28 +0200
Subject: [PATCH 13/14] Minor comment clean up

---
 .../org/apache/spark/mllib/kernels/GaussianDensityKernel.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
index 6de1c51c89df4..2fef3ee89f224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
@@ -99,7 +99,7 @@ class GaussianDensityKernel
    * the AMISE bandwidth yet and we use this estimator
    * as a means to get the AMISE bandwidth)
    *
-   * @param kernel The RDD containing the kernel matrix
+   * @param kernel The RDD containing the matrix
    * consisting of pairs Xi - Xj, where Xi and Xj
    * are drawn from the original data set.
    *

From 6b95548db3bbf180c330ee69121b371bcbc0f83e Mon Sep 17 00:00:00 2001
From: mandar2812
Date: Tue, 14 Apr 2015 02:10:13 +0200
Subject: [PATCH 14/14] Scala style check changes

---
 .../spark/mllib/kernels/DensityKernel.scala   |  2 +-
 .../mllib/kernels/GaussianDensityKernel.scala | 19 +++++++------------
 .../apache/spark/mllib/kernels/Kernel.scala   |  6 +-----
 .../spark/mllib/kernels/KernelEstimator.scala |  2 +-
 .../spark/mllib/kernels/SVMKernel.scala       |  4 ++--
 .../mllib/prototype/EntropySelector.scala     | 12 ++++++------
 6 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
index 7f8b7a06af7cc..8ee4a45556cf0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
@@ -35,4 +35,4 @@ trait DensityKernel extends Kernel with Serializable {
 
   protected def derivative(n: Int, x: Double): Double
 
-  }
+}

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
index 2fef3ee89f224..23260b74f6516 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
@@ -70,7 +70,7 @@ class GaussianDensityKernel
     this.bandwidth = b
   }
 
-  override def eval(x: Vector) = evalWithBandwidth(x, this.bandwidth)
+  override def eval(x: Vector):Double = evalWithBandwidth(x, this.bandwidth)
 
   /**
    * Calculates the derivative at point x for the Gaussian
@@ -139,37 +139,32 @@ class GaussianDensityKernel
   override def optimalBandwidth(data: RDD[Vector]): Unit = {
     val dataSize: Long = data.count()
 
-    //First calculate variance of all dimensions
+    // First calculate variance of all dimensions
     val columnStats = Statistics.colStats(data)
 
     // And then the standard deviation
     val colvar = columnStats.variance.toBreeze
    val colstd = colvar.map((v) => sqrt(v))
 
-    //Now calculate the initial estimates of R(f^6) and R(f^8)
-
-    /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)(
-      (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/
+    // Now calculate the initial estimates of R(f^6) and R(f^8)
 
     val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)(
      (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi)))
 
     /*
     * Use the earlier result to calculate
-    * h1 and h2 bandwidths for each dimension
+    * h2, the bandwidth for each dimension
     * */
-    /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) =>
-      pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/
 
     val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) =>
       pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9))
 
     /*
-    * Use h1 and h2 to calculate more
-    * refined estimates of R(f^6) and R(f^8)
+    * Use h2 to calculate more
+    * refined estimates of R(f^6)
     * */
 
-    //Get an 0-indexed version of the original data set
+    // Get an 0-indexed version of the original data set
     val mappedData = SVMKernel.indexedRDD(data)
 
     /*
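For reference, the plug-in rule implemented in optimalBandwidth above computes, per dimension d (with sigma_d the sample standard deviation, n the data size, mu the kernel constant this.mu, and K^(6) the sixth derivative of the kernel at zero):

    R(f^(8))_d = 105 / (32 * sqrt(pi) * sigma_d^9)
    h_d        = ( -2 * K^(6)(0) / (n * mu * R(f^(8))_d) )^(1/9)

One caveat worth flagging: in the Scala source the exponent is written as 1/9 (and 1/7 in the removed h1 line), which is integer division and evaluates to 0, so pow(..., 1/9) returns 1.0; the intended exponent is presumably 1.0/9.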
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
index 3d945fa6e22b5..13f50077744e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
@@ -16,16 +16,13 @@
  */
 package org.apache.spark.mllib.kernels
 
-import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.regression.LabeledPoint
 
 /**
  * Declares a trait Kernel which would serve
  * as a base trait for all classes implementing
  * Machine Learning Kernels.
- *
- **/
-
+ * */
 trait Kernel {
 
   /**
@@ -36,7 +33,6 @@ trait Kernel {
    * @param y a local Vector.
    *
    * @return the value of the Kernel function.
-   *
    * */
   def evaluate(x: LabeledPoint, y: LabeledPoint): Double
 }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
index 03cc504bc34c3..1af34acd8668c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
@@ -34,7 +34,7 @@ trait KernelEstimator extends Logging {
   /**
    * Calculate the AMISE (Asymptotic Mean Integrated Square Error)
    * optimal bandwidth assignment by 'solve the equation plug in method'
-   **/
+   * */
   def optimalBandwidth(data: RDD[Vector]): Unit
 }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
index a4a11dc53e2d1..4c26ca07c560e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
@@ -112,12 +112,12 @@ object SVMKernel extends Logging with Serializable {
 
     val result: DenseVector[Double] = DenseVector.tabulate(v.length)(
       (i) => {
-        //Get row number i of kernel
+        // Get row number i of kernel
        val row = DenseVector.apply(kernel
           .filter((point) => i == point._1._1)
           .map((p) => p._2)
           .collect())
-        //dot product with v
+        // dot product with v
         vbr.value.t * row
       }
     )
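The SVMKernel hunk above only retouches comments, but the surrounding code is the distributed kernel-matrix/vector product, built by filtering and collecting one row per output entry. A single-pass alternative sketch, assuming the kernel RDD has the RDD[((Long, Long), Double)] shape implied by point._1._1 and that v is already broadcast as vbr (as it is in the code above):

    // (K v)(i) = sum_j K(i, j) * v(j), accumulated with one reduceByKey
    // instead of one filter/collect per row.
    val products = kernel
      .map { case ((i, j), k) => (i, k * vbr.value(j.toInt)) }
      .reduceByKey(_ + _)
      .collectAsMap()
    val result = DenseVector.tabulate(v.length)((i) => products.getOrElse(i.toLong, 0.0))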
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
index 34d94544c26a4..3a0245a3c2853 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
@@ -64,12 +64,12 @@ class GreedyEntropySelector(
     val r = scala.util.Random
     var it: Int = 0
 
-    //All the elements not in the working set
+    // All the elements not in the working set
     var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p))
 
-    //Existing best value of the entropy
+    // Existing best value of the entropy
     var oldEntropy: Double = this.measure.evaluate(data.filter((point) =>
       workingset.contains(point._1)))
 
-    //Store the value of entropy after an element swap
+    // Store the value of entropy after an element swap
     var newEntropy: Double = 0.0
     var d: Double = Double.NegativeInfinity
     var rand: Int = 0
@@ -85,9 +85,9 @@ class GreedyEntropySelector(
 
       val point2 = newDataset.takeSample(false, 1).apply(0)
 
-      //Update the working set
+      // Update the working set
       workingset(rand) = point2
-      //Calculate the new entropy
+      // Calculate the new entropy
       newEntropy = this.measure.evaluate(data.filter((p) =>
         workingset.contains(p._1)))
 
@@ -123,7 +123,7 @@
     } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS)
     logInfo("Working set obtained, now starting process of packaging it as an RDD")
 
-    //Time to return the final working set
+    // Time to return the final working set
     data.filter((p) => workingset.contains(p._1))
   }
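Taken together, the prototype-selection pieces are exercised in KernelSuite roughly as follows; every identifier below comes from the patches above and the sizes are the ones used in the tests:

    // Fit the AMISE bandwidth, build the entropy measure, then greedily
    // pick a maximum-entropy subset of prototypes for the Nystrom step.
    val kern = new GaussianDensityKernel()
    kern.optimalBandwidth(testRDD.map(_.features))
    val entropy = new QuadraticRenyiEntropy(kern)
    val selector = new GreedyEntropySelector(entropy)
    val prototypes = selector.selectPrototypes(SVMKernel.indexedRDD(testRDD), subsetSize)
    // prototypes: (index, LabeledPoint) pairs, subsetSize of them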