From 9d7b00b08423dae455e57e1f00fa6471f8fd901c Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 22 Dec 2014 02:47:54 +0100 Subject: [PATCH 01/14] Initial commit for Kernels feature: Contains class heirarchies, implementation of the Nystrom method for feature map extractions, RBF and Polynomial Kernels. Also a bare bones test suite for SVM Kernels is included --- .../spark/mllib/kernels/DensityKernel.scala | 45 +++++ .../apache/spark/mllib/kernels/Kernel.scala | 40 ++++ .../mllib/kernels/PolynomialKernel.scala | 47 +++++ .../spark/mllib/kernels/RBFKernel.scala | 48 +++++ .../spark/mllib/kernels/SVMKernel.scala | 180 ++++++++++++++++++ .../spark/mllib/kernels/KernelSuite.scala | 84 ++++++++ 6 files changed, 444 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala new file mode 100644 index 0000000000000..abab0778f588f --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.rdd.RDD + +/** + * Abstract class which can be extended to + * implement various Multivariate Density + * Kernels. 
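+ *
+ * A density kernel is treated here as a function K(.) of a single vector
+ * argument, so the two-argument evaluate(x, y) below reduces to K(x - y).
+ * Concrete implementations supply the one-argument eval; the bandwidth
+ * vector and its AMISE-optimal selection are handled by the members below.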
+ */ +abstract class DensityKernel extends Kernel with Serializable{ + protected var bandwidth: Vector + + protected def setBandwidth(b: Vector): Unit = { + this.bandwidth = b + } + + protected def eval(x: Vector):Double + + override def evaluate(x: Vector, y: Vector): Double = + this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + + /** + * Calculates the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + protected def optimalBandwidth(data: RDD[Vector]): Vector + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala new file mode 100644 index 0000000000000..4730bf5dc5854 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg._ +/** + * Declares a trait Kernel which would serve + * as a base trait for all classes implementing + * Machine Learning Kernels. + * + **/ + +trait Kernel { + + /** + * Evaluates the value of the kernel given two + * vectorial parameters + * + * @param x a local Vector. + * @param y a local Vector. + * + * @return the value of the Kernel function. + * + * */ + def evaluate(x: Vector, y:Vector): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala new file mode 100644 index 0000000000000..cc449477ca173 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard Polynomial SVM Kernel + * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r + */ +class PolynomialKernel(private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setDegree(d: Int): Unit = { + this.degree = d + } + + def setOffset(o: Int): Unit = { + this.offset = o + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = + Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala new file mode 100644 index 0000000000000..e6ff82033f493 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector, norm} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard RBF Kernel of the form + * K(Xi,Xj) = exp(-||Xi - Xj||**2/2*bandwidth**2) + */ + +class RBFKernel(private var bandwidth: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setBandwidth(d: Double): Unit = { + this.bandwidth = d + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { + val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) + } + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala new file mode 100644 index 0000000000000..c5b70fc35ef37 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.mllib.linalg +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD +/** + * Defines an abstract class outlines the basic + * functionality requirements of an SVM Kernel + */ +abstract class SVMKernel[T] extends Kernel with Logging with Serializable { + + def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): KernelMatrix[T] + +} + +/** + * Defines a global singleton object + * [[SVMKernel]] which has useful functions + * while working with [[RDD]] of [[LabeledPoint]] + * + * */ +object SVMKernel extends Logging with Serializable{ + + /** + * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] + * + * @param data : An [[RDD]] of [[LabeledPoint]] + * + * @return An (Int, LabeledPoint) Key-Value RDD indexed + * from 0 to data.count() - 1 + * */ + def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + val sc = data.context + val i = sc.accumulator(-1, "Raw Data Index") + + data.map((point) => { + i+=1 + (i.localValue, point) + }) + } + + + /** + * This function constructs an [[SVMKernelMatrix]] + * + * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] + * @param length Length of the indexed [[RDD]] + * @param eval A function which calculates the value of the Kernel + * given two Vectors [[linalg.Vector]]. + * + * @return An [[SVMKernelMatrix]] object. + * + * */ + def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + length: Long, + eval: (linalg.Vector, linalg.Vector) => Double): + KernelMatrix[RDD[((Int, Int), Double)]] = { + + logInfo("Constructing key-value representation of kernel matrix.") + logInfo("Dimension: " + length + " x " + length) + + val labels = mappedData.map((p) => (p._1, p._2.label)) + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + eval(prod._1._2.features, prod._2._2.features))) + kernel.cache() + new SVMKernelMatrix(kernel, length, labels) + } + +} + +/** + * Defines a trait which outlines the basic + * functionality of Kernel Matrices. 
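+ *
+ * The type parameter T is the backing representation of the Gram matrix
+ * (an [[RDD]] of ((row, column), value) entries in the case of
+ * [[SVMKernelMatrix]]); getKernelMatrix exposes it and buildFeatureMap
+ * returns a low-rank, Nystrom-style feature map of the requested dimension.
+ *
+ * A hypothetical usage sketch, mirroring the accompanying KernelSuite test
+ * (the data set name and the dimension count are illustrative only):
+ *
+ * {{{
+ *   val indexed  = SVMKernel.indexedRDD(data)   // data: RDD[LabeledPoint]
+ *   val rbf      = new RBFKernel(1.0)
+ *   val matrix   = rbf.buildKernelMatrixasRDD(indexed, data.count())
+ *   val features = matrix.buildFeatureMap(10)   // RDD[LabeledPoint]
+ * }}}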
+ * */ +trait KernelMatrix[T] extends Serializable{ + protected val kernel: T + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] + def getKernelMatrix(): T = this.kernel +} + +class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], + private val dimension: Long, + private val labels: RDD[(Int, Double)]) + extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + + override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + val multiplyKernelMatrixOn = + (v :breeze.linalg.DenseVector[Double]) => { + val vbr = kernel.context.broadcast(v) + v.mapPairs((i, _) => { + //Get row number i of kernel + val row = kernel.filter((point) => i == point._1._1) + //multiply with v + var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") + row.foreach((rownum) => { + sum += rownum._2*vbr.value(rownum._1._2) + }) + sum.value + }) + } + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * @param dimensions The effective number of dimensions + * to be calculated in the feature map + * + * @return An RDD containing the non linear feature map + * of all the data points passed to the function. + * + * */ + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { + + + logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") + val decomposition = EigenValueDecomposition + .symmetricEigs( + multiplyKernelMatrixOn, + dimension.toInt, dimensions, + 0.0001, 300) + + logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") + + //TODO: Comment here + + val rows = kernel.groupBy((couple) => { + couple._1._1 + }) + + val temp = labels.join(rows) + + temp.map((datapoint) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + val eigenvector = decomposition._2(::, i) + val eigenvalue = decomposition._1(i) + var acc = 0.0 + datapoint._2._2.foreach((p) => + acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + ) + acc + } + new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) + }) + + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala new file mode 100644 index 0000000000000..8be61ee158f73 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.FunSuite + +class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ + + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData) + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(2, 1.5) + + val mappedData = SVMKernel.indexedRDD(testRDD) + + val kernelMatrix1 = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrix2 = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + assert(kernelMatrix1.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + assert(kernelMatrix2.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + + } + + test("Testing building of feature map from the kernel matrix"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + testRDD.cache() + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(5, 1.5) + val mappedData = SVMKernel.indexedRDD(testRDD) + + mappedData.cache() + val kernelMatrixpoly = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) + val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + + assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + + } +} From 2e29b123665b19f3f06d5f4f90d7884646da71a6 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Wed, 21 Jan 2015 18:34:29 +0100 Subject: [PATCH 02/14] 1) Changes to class hierarchy 2) Commenting Nystrom approximation code --- .../spark/mllib/kernels/DensityKernel.scala | 9 +- .../mllib/kernels/GaussianDensityKernel.scala | 82 +++++++++++++++++++ .../spark/mllib/kernels/RBFKernel.scala | 2 +- .../spark/mllib/kernels/SVMKernel.scala | 19 ++++- 4 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index abab0778f588f..baec10ceceeaa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -24,14 +24,9 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -abstract class DensityKernel extends Kernel with Serializable{ - protected var bandwidth: Vector +trait DensityKernel extends Kernel with Serializable{ - protected def setBandwidth(b: Vector): Unit = { - this.bandwidth = b - } - - protected def eval(x: Vector):Double + def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala new file mode 100644 index 0000000000000..e133ed69d78bd --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.kernels + +import breeze.linalg.{norm, DenseVector} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + + +class GaussianDensityKernel(protected var bandwidth: Vector) + extends DensityKernel with Logging { + + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b + } + + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + private def evalWithBandwidth(x: Vector, b: Vector): Double = { + val exp = scala.math.exp _ + val pow = scala.math.pow _ _ + val sqrt = scala.math.sqrt _ + val Pi = scala.math.Pi + + val buff = x.toBreeze + + val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( + b.size)( + (i) => buff(i)/b.apply(i) + ) + exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + } + + //TODO: Implement derivative function + private def derivative(n: Int)(x: Vector): Vector = { + Vectors.zeros(x.size) + } + + //TODO: Implement R integral + private def R(r: Int, pilot: Vector): Vector = { + Vectors.zeros(pilot.size) + } + + //TODO: Implement mu integral + private val mu: Vector = Vectors.zeros(this.bandwidth.size) + + override def optimalBandwidth(data: RDD[Vector]): Unit = { + + //First calculate variance of all dimensions + val columnStats = Statistics.colStats(data) + + val colvariance = columnStats.variance + + //Now calculate the initial estimates of R(f'''') and R(f'''''') + + //Use the earlier result to calculate h1 and h2 bandwidths for each + //dimension separately + + //Use the Sheathon and Jones 1991 result to calculate + //the optimal bandwidth + + //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index 
e6ff82033f493..fac11439fc192 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index c5b70fc35ef37..ff4d0d054e6bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -155,14 +155,29 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - //TODO: Comment here - + /* + * Get row number i of the + * Kernel Matrix + * */ val rows = kernel.groupBy((couple) => { couple._1._1 }) + /* + * Join the each row i with the + * target label for point i. + * */ val temp = labels.join(rows) + /* + * Now for each data point, + * calculate n dimensions of the + * feature map where n is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) + * */ temp.map((datapoint) => { val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => val eigenvector = decomposition._2(::, i) From 5239082426911032df7228163a94eca8281e5676 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Tue, 27 Jan 2015 17:03:26 +0100 Subject: [PATCH 03/14] Added optimal bandwidth selection procedure for Gaussian Density Kernel --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 181 +++++++++++++++--- .../spark/mllib/kernels/KernelEstimator.scala | 41 ++++ .../mllib/kernels/PolynomialKernel.scala | 6 +- .../spark/mllib/kernels/RBFKernel.scala | 6 +- .../spark/mllib/kernels/SVMKernel.scala | 25 ++- .../spark/mllib/kernels/KernelSuite.scala | 18 ++ 7 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index baec10ceceeaa..6658c5343ace3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -24,17 +25,15 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -trait DensityKernel extends Kernel with Serializable{ +trait DensityKernel extends Kernel with Serializable { def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) - /** - * Calculates the AMISE (Asymptotic Mean Integrated Square Error) - * optimal bandwidth assignment by 'solve the equation plug in method' - **/ - protected def optimalBandwidth(data: RDD[Vector]): Vector + protected def derivative(n: Int, x: Double): Double + protected val mu: Double + protected val r: Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index e133ed69d78bd..6c7621e11a208 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -23,60 +23,179 @@ import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD +import breeze.numerics.{sqrt => brsqrt} -class GaussianDensityKernel(protected var bandwidth: Vector) - extends DensityKernel with Logging { +class GaussianDensityKernel + extends DensityKernel with KernelEstimator with Logging with Serializable { + private val exp = scala.math.exp _ + private val pow = scala.math.pow _ + private val sqrt = scala.math.sqrt _ + private val Pi = scala.math.Pi + protected var bandwidth: Vector = Vectors.zeros(10) - def setBandwidth(b: linalg.Vector): Unit = { - this.bandwidth = b - } - - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + private def evalForDimension(x: Double, pilot: Double): Double = + exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) private def evalWithBandwidth(x: Vector, b: Vector): Double = { - val exp = scala.math.exp _ - val pow = scala.math.pow _ _ - val sqrt = scala.math.sqrt _ - val Pi = scala.math.Pi - + assert(x.size == b.size, + "Dimensions of vector x and the bandwidth of the kernel must match") val buff = x.toBreeze - + val bw = b.toBreeze val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( - b.size)( - (i) => buff(i)/b.apply(i) + bw.size)( + (i) => buff(i)/bw(i) ) - exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + exp(-1*pow(norm(normalizedbuff), 2)/2)/pow(sqrt(Pi * 2), b.size) } - //TODO: Implement derivative function - private def derivative(n: Int)(x: Vector): Vector = { - Vectors.zeros(x.size) + /* + * Calculate the value of the hermite polynomials + * tail recursively. This is needed to calculate + * the Gaussian derivatives at a point x. + * */ + private def hermite(n: Int, x: Double): Double = { + def hermiteHelper(k: Int, x: Double, a: Double, b: Double): Double = + k match { + case 0 => a + case 1 => b + case _ => hermiteHelper(k-1, x, b, x*b - (k-1)*a) + } + hermiteHelper(n, x, 1, x) } - //TODO: Implement R integral - private def R(r: Int, pilot: Vector): Vector = { - Vectors.zeros(pilot.size) + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b } - //TODO: Implement mu integral - private val mu: Vector = Vectors.zeros(this.bandwidth.size) + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + /** + * Calculates the derivative at point x for the Gaussian + * Density Kernel, for only one dimension. 
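+   *
+   * This uses the identity d^n/dx^n phi(x) = (-1)^n * He_n(x) * phi(x),
+   * where phi is the standard normal density and He_n is the nth
+   * (probabilists') Hermite polynomial produced by the tail-recursive
+   * helper above.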
+ * + * @param n The number of times the gaussian has to be differentiated + * @param x The point x at which the derivative has to evaluated + * @return The value of the nth derivative at the point x + * */ + override def derivative(n: Int, x: Double): Double = { + (1/sqrt(2*Pi))*(1/pow(-1.0,n))*exp(-1*pow(x,2)/2)*hermite(n, x) + } + + /** + * Implementation of the estimator for the R integral + * for a multivariate Gaussian Density Kernel. + * Evaluates R(D_r(f(x))). + * + * @param r the degree of the derivative of the kernel + * + * @param N The size of the original data set from which + * kernel matrix [[RDD]] was constructed. + * + * @param pilot The pilot bandwidth to be used to calculate + * the kernel values. (Note that we have not calculated + * the AMISE bandwidth yet and we use this estimator + * as a means to get the AMISE bandwidth) + * + * @param kernel The RDD containing the kernel matrix + * consisting of pairs Xi - Xj, where Xi and Xj + * are drawn from the original data set. + * + * @return R the estimated value of the integral of the square + * of the rth derivative of the kernel over the Real domain. + * */ + override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { + + + /* + * Apply map to get values of the derivative of the kernel + * at various point pairs. + * */ + val kernelNormalized = kernel.map((couple) => + (couple._1, Vectors.fromBreeze(DenseVector.tabulate(pilot.size) + ((i) => (1/(pow(N, 2)*pow(pilot(i), r + 1)))* + this.derivative(r, couple._2.toBreeze(i)/pilot(i))) + ))) + + /* + * Sum up all the individual values to get the estimated + * value of the integral + * */ + val integralvalue = kernelNormalized.reduce((a,b) => + ((0,0), Vectors.fromBreeze(a._2.toBreeze + b._2.toBreeze))) + + integralvalue._2.toBreeze + } + + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) + + /** + * Use the Sheather and Jones plug-in + * method to calculate the optimal bandwidth + * http://bit.ly/1EoBY7q + * + * */ override def optimalBandwidth(data: RDD[Vector]): Unit = { + val dataSize: Long = data.count() //First calculate variance of all dimensions val columnStats = Statistics.colStats(data) + // And then the standard deviation + val colvar = columnStats.variance.toBreeze + val colstd = colvar.map((v) => sqrt(v)) + + //Now calculate the initial estimates of R(f^6) and R(f^8) + + /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/ + + val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi))) + + /* + * Use the earlier result to calculate + * h1 and h2 bandwidths for each dimension + * */ + + /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/ + val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9)) + + + /* + * Use h1 and h2 to calculate more + * refined estimates of R(f^6) and R(f^8) + * */ + + //Get an 0-indexed version of the original data set + val mappedData = SVMKernel.indexedRDD(data) + + /* + * Apply cartesian product on the indexed data set + * and then map it to a RDD of type [(i,j), Xi - Xj] + * */ + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + Vectors.fromBreeze(prod._1._2.toBreeze - + 
prod._2._2.toBreeze)) + ) + kernel.cache() + - val colvariance = columnStats.variance + val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) - //Now calculate the initial estimates of R(f'''') and R(f'''''') + val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow((-2*this.derivative(4, 0.0))/(dataSize*this.mu*newRf6(i)), 1/7)) - //Use the earlier result to calculate h1 and h2 bandwidths for each - //dimension separately + val newRf4: breeze.linalg.Vector[Double] = this.R(4, dataSize, hAMSE, kernel) - //Use the Sheathon and Jones 1991 result to calculate - //the optimal bandwidth + val hAMISE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(this.r/(dataSize*this.mu*this.mu*newRf4(i)), 1/5)) - //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + this.bandwidth = Vectors.fromBreeze(hAMISE) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala new file mode 100644 index 0000000000000..dedbd4c3a6264 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Trait defining the basic behavior + * of a Kernel density estimator + */ +trait KernelEstimator extends Logging { + + protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] + + + /** + * Calculate the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + def optimalBandwidth(data: RDD[Vector]): Unit + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index cc449477ca173..51abfad97c060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD */ class PolynomialKernel(private var degree: Int, private var offset: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -40,8 +40,8 @@ class PolynomialKernel(private var degree: Int, override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index fac11439fc192..d5c9285e8c394 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d @@ -40,9 +40,9 @@ class RBFKernel(private var bandwidth: Double) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index ff4d0d054e6bb..74bec1050f913 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,8 +16,7 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector} -import 
org.apache.spark.annotation.DeveloperApi +import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg import org.apache.spark.{SparkContext, Logging} import org.apache.spark.mllib.linalg._ @@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): KernelMatrix[T] } @@ -50,9 +49,9 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { val sc = data.context - val i = sc.accumulator(-1, "Raw Data Index") + val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") data.map((point) => { i+=1 @@ -72,10 +71,10 @@ object SVMKernel extends Logging with Serializable{ * @return An [[SVMKernelMatrix]] object. * * */ - def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], length: Long, eval: (linalg.Vector, linalg.Vector) => Double): - KernelMatrix[RDD[((Int, Int), Double)]] = { + KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") logInfo("Dimension: " + length + " x " + length) @@ -100,12 +99,12 @@ trait KernelMatrix[T] extends Serializable{ def getKernelMatrix(): T = this.kernel } -class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], +class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], private val dimension: Long, - private val labels: RDD[(Int, Double)]) - extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { - override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel /** * Defines a function value which @@ -123,7 +122,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], //multiply with v var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2) + sum += rownum._2*vbr.value(rownum._1._2.toInt) }) sum.value }) @@ -184,7 +183,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], val eigenvalue = decomposition._1(i) var acc = 0.0 datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) ) acc } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 8be61ee158f73..6ac6ae9b33e18 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -81,4 +81,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) } + + test("Testing optimal bandwidth calculation on Gaussian Kernel"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A 
= 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + val newtestRDD = testRDD.map((p) => p.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + assert(kern.eval(newtestRDD.first()) != Double.NaN) + } } From bf1e9829b798ca76d408f6d1a66220720948c038 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Thu, 29 Jan 2015 14:42:53 +0100 Subject: [PATCH 04/14] Entropy based subset selection done, unit tests passing --- .../spark/mllib/kernels/SVMKernel.scala | 20 ++- .../mllib/prototype/EntropyMeasure.scala | 49 +++++++ .../mllib/prototype/EntropySelector.scala | 127 ++++++++++++++++++ .../spark/mllib/prototype/Measure.scala | 28 ++++ .../prototype/QuadraticRenyiEntropy.scala | 60 +++++++++ .../mllib/prototype/SubsetSelector.scala | 28 ++++ .../spark/mllib/kernels/KernelSuite.scala | 20 ++- 7 files changed, 319 insertions(+), 13 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 74bec1050f913..5321e55a07c70 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -39,7 +39,7 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * while working with [[RDD]] of [[LabeledPoint]] * * */ -object SVMKernel extends Logging with Serializable{ +object SVMKernel extends Logging with Serializable { /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] @@ -49,15 +49,7 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { - val sc = data.context - val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") - - data.map((point) => { - i+=1 - (i.localValue, point) - }) - } + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) /** @@ -87,13 +79,19 @@ object SVMKernel extends Logging with Serializable{ new SVMKernelMatrix(kernel, length, labels) } + def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): + RDD[LabeledPoint] = mappedData.join(labels).map((point) => + new LabeledPoint(point._2._2, point._2._1)) + + } /** * Defines a trait which outlines the basic * functionality of Kernel Matrices. 
* */ -trait KernelMatrix[T] extends Serializable{ +trait KernelMatrix[T] extends Serializable { protected val kernel: T def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] def getKernelMatrix(): T = this.kernel diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala new file mode 100644 index 0000000000000..73bcfa3aab30e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.Vector + +/** + * Models a general entropy measure. + * Any entropy measure would require a + * probability distribution + */ +abstract class EntropyMeasure extends Measure[Vector] +with Serializable{ + + protected val density: DensityKernel + + /** + * Given a probability distribution for + * the data set, calculate the entropy of + * the data set with respect to the given + * distribution. + * + * @param data The data set whose entropy is + * required. + * + * @return The entropy of the data set. + * */ + + def entropy[K](data: RDD[(K, Vector)]): Double + + override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala new file mode 100644 index 0000000000000..1543919c1fe53 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD + +/** + * Basic skeleton of an entropy based + * subset selector + */ +abstract class EntropySelector + extends SubsetSelector[(Long, Vector)] with Serializable + with Logging { + protected val measure: EntropyMeasure + protected val delta: Double + protected val MAX_ITERATIONS: Int +} + +class GreedyEntropySelector(m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector with Serializable + with Logging { + + override protected val measure: EntropyMeasure = m + override protected val delta: Double = del + override protected val MAX_ITERATIONS: Int = max + + override def selectPrototypes(data: RDD[(Long, Vector)], + M: Int): RDD[(Long, Vector)] = { + + val context = data.context + + /* + * Draw an initial sample of M points + * from data without replacement. + * + * Define a working set which we + * will use as a prototype set to + * to each iteration + * */ + + val workingset = data.keys.takeSample(false, M) + + val r = scala.util.Random + var it: Int = 0 + + //All the elements not in the working set + var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p)) + //Existing best value of the entropy + var oldEntropy: Double = this.measure.evaluate(data.filter((point) => + workingset.contains(point._1))) + //Store the value of entropy after an element swap + var newEntropy: Double = 0.0 + var d: Double = Double.NegativeInfinity + var rand: Int = 0 + do { + /* + * Randomly select a point from + * the working set as well as data + * and then swap them. + * */ + rand = r.nextInt(workingset.length - 1) + val point1 = workingset.apply(rand) + + val point2 = newDataset.takeSample(false, 1).apply(0) + + //Update the working set + workingset(rand) = point2 + //Calculate the new entropy + newEntropy = this.measure.evaluate(data.filter((p) => + workingset.contains(p._1))) + + /* + * Calculate the change in entropy, + * if it has improved then keep the + * swap, otherwise revert to existing + * working set. + * */ + d = newEntropy - oldEntropy + + if(d < 0) { + /* + * Improvement in entropy so + * keep the updated working set + * as it is and update the + * variable 'newDataset' + * */ + oldEntropy = newEntropy + newDataset = data.keys.filter((p) => !workingset.contains(p)) + } else { + /* + * No improvement in entropy + * so revert the working set + * to its initial state. Leave + * the variable newDataset as + * it is. + * */ + workingset(rand) = point1 + } + + it += 1 + } while(math.abs(d) >= this.delta && + it <= this.MAX_ITERATIONS) + + //Time to return the final working set + data.filter((p) => workingset.contains(p._1)) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala new file mode 100644 index 0000000000000..80d466fb18ee3 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Trait which outlines basic behavior + * of a subset utility measure. + */ +trait Measure[T] { + def evaluate[K](data: RDD[(K, T)]): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala new file mode 100644 index 0000000000000..d2fcbaef381e8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.prototype + +import breeze.linalg.DenseVector +import org.apache.spark.Logging +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Vectors, Vector} + +/** + * Implements the quadratic Renyi Entropy + */ +class QuadraticRenyiEntropy(dist: DensityKernel) + extends EntropyMeasure with Serializable with Logging { + + val log_e = scala.math.log _ + val sqrt = scala.math.sqrt _ + + override protected val density: DensityKernel = dist + + /** + * Calculate the quadratic Renyi entropy + * within a distribution specific + * proportionality constant. This can + * be used to compare the entropy values of + * different sets of data on the same + * distribution. + * + * @param data The data set whose entropy is + * required. + * @return The entropy of the dataset assuming + * it is distributed as given by the value + * parameter 'density'. 
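+   *
+   * The quadratic Renyi entropy is H_2(X) = -log( integral of p(x)^2 dx );
+   * for a Gaussian density kernel the plug-in estimate reduces, up to the
+   * proportionality constant dropped here, to
+   * -log( sum over all pairs (i, j) of K((x_i - x_j) / sqrt(2)) ),
+   * which is what the pairwise cartesian computation below evaluates.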
+ * */ + + override def entropy[K](data: RDD[(K, Vector)]): Double = { + val dim = data.first()._2.size + val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) + -1*log_e(data.cartesian(data).map((couple) => + density.evaluate( + Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), + Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) + )).reduce((a,b) => a + b)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala new file mode 100644 index 0000000000000..c96bcb0dd3a3e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Defines the characteristics of + * a subset selector + */ +trait SubsetSelector[T] extends Serializable{ + def selectPrototypes(data: RDD[T], M: Int): RDD[T] +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 6ac6ae9b33e18..26f163ada25c2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,7 +16,9 @@ */ package org.apache.spark.mllib.kernels +import breeze.linalg.norm import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.FunSuite @@ -82,8 +84,9 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { } - test("Testing optimal bandwidth calculation on Gaussian Kernel"){ - val nPoints = 100 + test("Testing optimal bandwidth calculation on Gaussian Kernel" + + " and maximum entropy subset selection"){ + val nPoints = 10000 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 @@ -98,5 +101,18 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kern = new GaussianDensityKernel() kern.optimalBandwidth(newtestRDD) assert(kern.eval(newtestRDD.first()) != Double.NaN) + + val newIndexedRDD = SVMKernel.indexedRDD(newtestRDD) + newIndexedRDD.cache() + newtestRDD.unpersist() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + + val subsetRDD = subsetsel.selectPrototypes( + newIndexedRDD, + 100) + + assert(subsetRDD.count() == 100) } } From 43f85c20f1b8ca8ed7324f828df3a88295d26a3d Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 00:26:03 +0100 
Subject: [PATCH 05/14] 1) Optimized code for feature map extraction, kernel matrix multiplication 2) Code indentation changes --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 25 +-- .../apache/spark/mllib/kernels/Kernel.scala | 4 +- .../spark/mllib/kernels/KernelEstimator.scala | 9 +- .../mllib/kernels/PolynomialKernel.scala | 20 +- .../spark/mllib/kernels/RBFKernel.scala | 18 +- .../spark/mllib/kernels/SVMKernel.scala | 205 ++++++++++-------- .../mllib/prototype/EntropyMeasure.scala | 9 +- .../mllib/prototype/EntropySelector.scala | 31 +-- .../prototype/QuadraticRenyiEntropy.scala | 18 +- .../spark/mllib/kernels/KernelSuite.scala | 80 ++++++- 11 files changed, 258 insertions(+), 172 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index 6658c5343ace3..7f8b7a06af7cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD /** * Abstract class which can be extended to @@ -26,14 +25,14 @@ import org.apache.spark.rdd.RDD * Kernels. */ trait DensityKernel extends Kernel with Serializable { + protected val mu: Double + protected val r: Double def eval(x: Vector):Double - override def evaluate(x: Vector, y: Vector): Double = - this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + this.eval(Vectors.fromBreeze(x.features.toBreeze.-=(y.features.toBreeze))) protected def derivative(n: Int, x: Double): Double - protected val mu: Double - protected val r: Double -} + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index 6c7621e11a208..6de1c51c89df4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -19,20 +19,22 @@ package org.apache.spark.mllib.kernels import breeze.linalg.{norm, DenseVector} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD -import breeze.numerics.{sqrt => brsqrt} - class GaussianDensityKernel - extends DensityKernel with KernelEstimator with Logging with Serializable { + extends DensityKernel + with KernelEstimator + with Logging + with Serializable { private val exp = scala.math.exp _ private val pow = scala.math.pow _ private val sqrt = scala.math.sqrt _ private val Pi = scala.math.Pi protected var bandwidth: Vector = Vectors.zeros(10) + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) private def evalForDimension(x: Double, pilot: Double): Double = exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) @@ -64,12 +66,11 @@ class GaussianDensityKernel hermiteHelper(n, x, 1, x) } - def setBandwidth(b: linalg.Vector): Unit = { + def setBandwidth(b: Vector): Unit = { this.bandwidth = b } - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) - + override def eval(x: Vector) = 
evalWithBandwidth(x, this.bandwidth) /** * Calculates the derivative at point x for the Gaussian @@ -105,9 +106,9 @@ class GaussianDensityKernel * @return R the estimated value of the integral of the square * of the rth derivative of the kernel over the Real domain. * */ - override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { - + override protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { /* * Apply map to get values of the derivative of the kernel @@ -129,9 +130,6 @@ class GaussianDensityKernel integralvalue._2.toBreeze } - override protected val mu = (1/4)*(1/sqrt(Pi)) - override protected val r = (1/2)*(1/sqrt(Pi)) - /** * Use the Sheather and Jones plug-in * method to calculate the optimal bandwidth @@ -185,7 +183,6 @@ class GaussianDensityKernel ) kernel.cache() - val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala index 4730bf5dc5854..3d945fa6e22b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint + /** * Declares a trait Kernel which would serve * as a base trait for all classes implementing @@ -36,5 +38,5 @@ trait Kernel { * @return the value of the Kernel function. 
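For a quick sense of the updated signature, a small worked example with the two SVM kernels from this patch set; the labels are placeholders, since only the feature vectors enter the kernel value, and LabeledPoint/Vectors are assumed to be in scope:

val a = LabeledPoint(0.0, Vectors.dense(1.0, 2.0))
val b = LabeledPoint(1.0, Vectors.dense(3.0, 4.0))

new PolynomialKernel(2, 1.5).evaluate(a, b)  // (1*3 + 2*4 + 1.5)^2 = 156.25
new RBFKernel(1.0).evaluate(a, b)            // exp(-||a - b||^2 / 2) = exp(-4) ≈ 0.0183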
* * */ - def evaluate(x: Vector, y:Vector): Double + def evaluate(x: LabeledPoint, y: LabeledPoint): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala index dedbd4c3a6264..03cc504bc34c3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -18,8 +18,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** @@ -28,9 +27,9 @@ import org.apache.spark.rdd.RDD */ trait KernelEstimator extends Logging { - protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] - + protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] /** * Calculate the AMISE (Asymptotic Mean Integrated Square Error) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index 51abfad97c060..828aca0b48570 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -17,7 +17,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -25,9 +24,12 @@ import org.apache.spark.rdd.RDD * Standard Polynomial SVM Kernel * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r */ -class PolynomialKernel(private var degree: Int, - private var offset: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ +class PolynomialKernel( + private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -37,11 +39,11 @@ class PolynomialKernel(private var degree: Int, this.offset = o } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = - Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + Math.pow(x.features.toBreeze dot y.features.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index d5c9285e8c394..3b78b159d43b1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -16,10 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector, norm} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg -import 
org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -29,20 +27,22 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { - val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = { + val diff: Vector = Vectors.fromBreeze(x.features.toBreeze - y.features.toBreeze) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 5321e55a07c70..a4a11dc53e2d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,9 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.DenseVector -import org.apache.spark.mllib.linalg -import org.apache.spark.{SparkContext, Logging} +import breeze.linalg.{DenseVector, DenseMatrix} +import org.apache.spark.Logging import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -28,9 +27,67 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): KernelMatrix[T] + /** + * Build the kernel matrix of the prototype vectors + * + * @param mappedData The prototype vectors/points + * + * @param length The number of points + * + * @return A [[KernelMatrix]] object + * + * + * */ + def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[T] + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * For each data point, + * calculate m dimensions of the + * feature map where m is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, m, K(k, x)*eigenvector(i)(k)) + * + * @param decomposition The Eigenvalue decomposition calculated + * from the kernel matrix of the prototype + * subset. + * @param prototypes The prototype subset. + * + * @param data The dataset [[RDD]] on which the feature map + * is to be applied. 
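The same Nystrom construction in a self-contained, single-machine form, using Breeze's dense eigSym where the distributed code uses the ARPACK routine; `k` and `protos` are illustrative stand-ins for the kernel function and the prototype subset:

import breeze.linalg.{DenseMatrix, DenseVector, eigSym}
import breeze.linalg.eigSym.EigSym

def nystromFeatureMap(
    k: (Array[Double], Array[Double]) => Double,
    protos: Array[Array[Double]]): Array[Double] => DenseVector[Double] = {
  val m = protos.length
  // m x m Gram matrix of the prototypes and its eigendecomposition
  // (in practice only the leading `dimensions` eigenpairs are kept,
  // which is what symmetricEigs returns)
  val gram = DenseMatrix.tabulate(m, m)((i, j) => k(protos(i), protos(j)))
  val EigSym(lambda, u) = eigSym(gram)
  (x: Array[Double]) => {
    val kx = DenseVector.tabulate(m)(j => k(protos(j), x))
    // phi_i(x) = (1 / sqrt(lambda_i)) * sum_j K(z_j, x) * u(j, i)
    DenseVector.tabulate(m)(i => (u(::, i) dot kx) / math.sqrt(lambda(i)))
  }
}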
+ * + * */ + def featureMapping(decomposition: (DenseVector[Double], DenseMatrix[Double])) + (prototypes: RDD[(Long, LabeledPoint)]) + (data: RDD[(Long, LabeledPoint)]) + : RDD[(Long, LabeledPoint)] = { + + logInfo("Calculating the Non Linear feature map of data set") + + data.cartesian(prototypes) + .map((couple) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + var eigenvector = 0.0 + if (couple._2._1.toInt < decomposition._1.length) { + eigenvector = decomposition._2(couple._2._1.toInt, i) + } + val eigenvalue = decomposition._1(i) + this.evaluate(couple._1._2, couple._2._2) * eigenvector/Math.sqrt(eigenvalue) + } + (couple._1._1, (couple._1._2.label, y)) + }).reduceByKey((veca, vecb) => (veca._1, veca._2 + vecb._2)) + .map((p) => (p._1, new LabeledPoint(p._2._1, Vectors.fromBreeze(p._2._2)))) + } } /** @@ -41,6 +98,32 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * */ object SVMKernel extends Logging with Serializable { + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + def multiplyKernelMatrixBy(kernel: RDD[((Long, Long), Double)]) + (v :breeze.linalg.DenseVector[Double]): + DenseVector[Double] = { + val vbr = kernel.context.broadcast(v) + val result: DenseVector[Double] = + DenseVector.tabulate(v.length)( + (i) => { + //Get row number i of kernel + val row = DenseVector.apply(kernel + .filter((point) => i == point._1._1) + .map((p) => p._2) + .collect()) + //dot product with v + vbr.value.t * row + } + ) + result + } + /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] * @@ -49,8 +132,8 @@ object SVMKernel extends Logging with Serializable { * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) - + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = + data.zipWithIndex().map((p) => (p._2, p._1)) /** * This function constructs an [[SVMKernelMatrix]] @@ -58,14 +141,15 @@ object SVMKernel extends Logging with Serializable { * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] * @param length Length of the indexed [[RDD]] * @param eval A function which calculates the value of the Kernel - * given two Vectors [[linalg.Vector]]. + * given two Labeled Points [[LabeledPoint]]. * * @return An [[SVMKernelMatrix]] object. 
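In miniature, this is what the resulting kernel matrix RDD holds: one ((i, j), K(x_i, x_j)) entry per ordered pair of indexed points. The SparkContext `sc` and the concrete numbers below are illustrative only:

val pts = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 2.0)),
  LabeledPoint(1.0, Vectors.dense(3.0, 4.0))))
val indexed = SVMKernel.indexedRDD(pts)              // RDD[(Long, LabeledPoint)]
val kmat = new RBFKernel(1.0).buildKernelMatrixasRDD(indexed, 2L)
kmat.getKernelMatrix().collect()
// entries, in no particular order:
// ((0,0), 1.0), ((0,1), exp(-4)), ((1,0), exp(-4)), ((1,1), 1.0)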
* * */ - def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], - length: Long, - eval: (linalg.Vector, linalg.Vector) => Double): + def buildSVMKernelMatrix( + mappedData: RDD[(Long, LabeledPoint)], + length: Long, + eval: (LabeledPoint, LabeledPoint) => Double): KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") @@ -74,17 +158,19 @@ object SVMKernel extends Logging with Serializable { val labels = mappedData.map((p) => (p._1, p._2.label)) val kernel = mappedData.cartesian(mappedData) .map((prod) => ((prod._1._1, prod._2._1), - eval(prod._1._2.features, prod._2._2.features))) + eval(prod._1._2, prod._2._2))) kernel.cache() new SVMKernelMatrix(kernel, length, labels) } - def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], - labels: RDD[(Long, Double)]): - RDD[LabeledPoint] = mappedData.join(labels).map((point) => + def zipVectorsWithLabels( + mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): RDD[LabeledPoint] = + mappedData.join(labels).map((point) => new LabeledPoint(point._2._2, point._2._1)) - + def unzipIndexedData(mappedData: RDD[(Long, LabeledPoint)]): + RDD[LabeledPoint] = mappedData.map((p) => p._2) } /** @@ -93,38 +179,19 @@ object SVMKernel extends Logging with Serializable { * */ trait KernelMatrix[T] extends Serializable { protected val kernel: T - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] - def getKernelMatrix(): T = this.kernel -} -class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], - private val dimension: Long, - private val labels: RDD[(Long, Double)]) - extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { + def eigenDecomposition(dimensions: Int): (DenseVector[Double], DenseMatrix[Double]) - override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel + def getKernelMatrix(): T = this.kernel +} - /** - * Defines a function value which - * calculates the multiplication of - * the Kernel Matrix with a Breeze - * Vector and returns the result as a - * Breeze DenseVector. - * */ - val multiplyKernelMatrixOn = - (v :breeze.linalg.DenseVector[Double]) => { - val vbr = kernel.context.broadcast(v) - v.mapPairs((i, _) => { - //Get row number i of kernel - val row = kernel.filter((point) => i == point._1._1) - //multiply with v - var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") - row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2.toInt) - }) - sum.value - }) - } +class SVMKernelMatrix( + override protected val kernel: RDD[((Long, Long), Double)], + private val dimension: Long, + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] + with Logging + with Serializable { /** * Builds an approximate nonlinear feature map @@ -140,53 +207,13 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)] * of all the data points passed to the function. 
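Sketch of how the pieces are meant to compose, following the pattern used in KernelSuite; `indexed` is an indexed RDD of LabeledPoint as produced by SVMKernel.indexedRDD, and the bandwidth and dimension count are illustrative:

val rbf = new RBFKernel(0.8)
val kernelMatrix = rbf.buildKernelMatrixasRDD(indexed, indexed.count())
// truncated eigendecomposition of the prototype kernel matrix ...
val decomposition = kernelMatrix.eigenDecomposition(5)
// ... plugged into the Nystrom feature map and applied to the full data set
val mapped = rbf.featureMapping(decomposition)(indexed)(indexed)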
* * */ - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { - - + override def eigenDecomposition(dimensions: Int = this.dimension.toInt): + (DenseVector[Double], DenseMatrix[Double]) = { logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") - val decomposition = EigenValueDecomposition + EigenValueDecomposition .symmetricEigs( - multiplyKernelMatrixOn, + SVMKernel.multiplyKernelMatrixBy(kernel), dimension.toInt, dimensions, 0.0001, 300) - - logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - - /* - * Get row number i of the - * Kernel Matrix - * */ - val rows = kernel.groupBy((couple) => { - couple._1._1 - }) - - /* - * Join the each row i with the - * target label for point i. - * */ - val temp = labels.join(rows) - - /* - * Now for each data point, - * calculate n dimensions of the - * feature map where n is the number - * of eigenvalues/vectors obtained from - * the Eigen Decomposition. - * - * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) - * */ - temp.map((datapoint) => { - val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => - val eigenvector = decomposition._2(::, i) - val eigenvalue = decomposition._1(i) - var acc = 0.0 - datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) - ) - acc - } - new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) - }) - } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala index 73bcfa3aab30e..78ffbda08b3d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -18,16 +18,15 @@ package org.apache.spark.mllib.prototype import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.Vector /** * Models a general entropy measure. * Any entropy measure would require a * probability distribution */ -abstract class EntropyMeasure extends Measure[Vector] -with Serializable{ +abstract class EntropyMeasure extends Measure[LabeledPoint] with Serializable { protected val density: DensityKernel @@ -43,7 +42,7 @@ with Serializable{ * @return The entropy of the data set. 
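The greedy exchange performed by GreedyEntropySelector further down, reduced to a local sketch over index sets; `entropyOf` stands in for evaluating the measure on the corresponding subset of the data, and every other name is illustrative:

def greedySelect(
    n: Int, m: Int, entropyOf: Set[Int] => Double,
    delta: Double = 0.0001, maxIterations: Int = 5000): Set[Int] = {
  val rnd = new scala.util.Random()
  // start from a random working set of m indices
  var working = rnd.shuffle((0 until n).toVector).take(m).toSet
  var old = entropyOf(working)
  var d = Double.PositiveInfinity
  var it = 0
  while (math.abs(d) >= delta && it < maxIterations) {
    // swap one random member for one random non-member
    val outside = (0 until n).filterNot(working.contains)
    val candidate =
      working - working.toVector(rnd.nextInt(m)) + outside(rnd.nextInt(outside.size))
    val fresh = entropyOf(candidate)
    d = fresh - old
    if (d > 0) { working = candidate; old = fresh }  // keep the swap only if entropy improves
    it += 1
  }
  working
}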
* */ - def entropy[K](data: RDD[(K, Vector)]): Double + def entropy[K](data: RDD[(K, LabeledPoint)]): Double - override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) + override def evaluate[K](data: RDD[(K, LabeledPoint)]): Double = this.entropy(data) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala index 1543919c1fe53..34d94544c26a4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -17,8 +17,8 @@ package org.apache.spark.mllib.prototype -import org.apache.spark.{SparkContext, Logging} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.Logging +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -26,27 +26,29 @@ import org.apache.spark.rdd.RDD * subset selector */ abstract class EntropySelector - extends SubsetSelector[(Long, Vector)] with Serializable + extends SubsetSelector[(Long, LabeledPoint)] + with Serializable with Logging { protected val measure: EntropyMeasure protected val delta: Double protected val MAX_ITERATIONS: Int } -class GreedyEntropySelector(m: EntropyMeasure, - del: Double = 0.0001, - max: Int = 5000) - extends EntropySelector with Serializable +class GreedyEntropySelector( + m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector + with Serializable with Logging { override protected val measure: EntropyMeasure = m override protected val delta: Double = del override protected val MAX_ITERATIONS: Int = max - override def selectPrototypes(data: RDD[(Long, Vector)], - M: Int): RDD[(Long, Vector)] = { - - val context = data.context + override def selectPrototypes( + data: RDD[(Long, LabeledPoint)], + M: Int): RDD[(Long, LabeledPoint)] = { /* * Draw an initial sample of M points @@ -56,7 +58,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * will use as a prototype set to * to each iteration * */ - + logInfo("Initializing the working set, by drawing randomly from the training set") val workingset = data.keys.takeSample(false, M) val r = scala.util.Random @@ -71,6 +73,7 @@ class GreedyEntropySelector(m: EntropyMeasure, var newEntropy: Double = 0.0 var d: Double = Double.NegativeInfinity var rand: Int = 0 + logInfo("Starting iterative, entropy based greedy subset selection") do { /* * Randomly select a point from @@ -96,7 +99,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * */ d = newEntropy - oldEntropy - if(d < 0) { + if(d > 0) { /* * Improvement in entropy so * keep the updated working set @@ -119,7 +122,7 @@ class GreedyEntropySelector(m: EntropyMeasure, it += 1 } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS) - + logInfo("Working set obtained, now starting process of packaging it as an RDD") //Time to return the final working set data.filter((p) => workingset.contains(p._1)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala index d2fcbaef381e8..3613dba8a723e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -19,18 +19,20 @@ package org.apache.spark.mllib.prototype import breeze.linalg.DenseVector import org.apache.spark.Logging import 
org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * Implements the quadratic Renyi Entropy */ class QuadraticRenyiEntropy(dist: DensityKernel) - extends EntropyMeasure with Serializable with Logging { + extends EntropyMeasure + with Serializable + with Logging { val log_e = scala.math.log _ val sqrt = scala.math.sqrt _ - override protected val density: DensityKernel = dist /** @@ -48,13 +50,11 @@ class QuadraticRenyiEntropy(dist: DensityKernel) * parameter 'density'. * */ - override def entropy[K](data: RDD[(K, Vector)]): Double = { - val dim = data.first()._2.size + override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { + val dim = data.first()._2.features.size val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) -1*log_e(data.cartesian(data).map((couple) => - density.evaluate( - Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), - Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) - )).reduce((a,b) => a + b)) + density.eval(Vectors.fromBreeze(couple._1._2.features.toBreeze :/ root_two - + couple._2._2.features.toBreeze :/ root_two))).reduce((a,b) => a + b)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 26f163ada25c2..c03ce34d0ce6c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.norm +import org.scalatest.FunSuite import org.apache.spark.mllib.classification.SVMSuite import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.FunSuite class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ val nPoints = 100 @@ -76,18 +76,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) assert(mappedData.count() == nPoints) - val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) - val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + val mappedFeaturespoly = poly.featureMapping( + kernelMatrixpoly.eigenDecomposition(99) + )(mappedData)(mappedData) + val mappedFeaturesrbf = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(99) + )(mappedData)(mappedData) - assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) - assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) } test("Testing optimal bandwidth calculation on Gaussian Kernel" + " and maximum entropy subset selection"){ - val nPoints = 10000 - + val nPoints = 1000 + val subsetSize = 100 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 val B = -1.5 @@ -110,9 +114,63 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( - newIndexedRDD, 
- 100) + SVMKernel.indexedRDD(testRDD), + subsetSize) + + assert(subsetRDD.count() == subsetSize) + } + + test("Testing rbf kernel with subset selection and feature map extraction") { + val nPoints = 1000 + val nDimensions = 5 + val subsetSize = 100 + val unZip = SVMKernel.unzipIndexedData _ + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput( + A, + Array[Double](B, C), + nPoints, + 42) + + val testRDD = sc.parallelize(testData, 2) + + val newtestRDD = testRDD.map(_.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + newtestRDD.unpersist() + val mappedData = SVMKernel.indexedRDD(testRDD) + mappedData.cache() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val subsetRDD = subsetsel.selectPrototypes( + mappedData, + subsetSize) + + val rbf = new RBFKernel(0.8) + subsetRDD.cache() + + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD( + SVMKernel.indexedRDD(unZip(subsetRDD)), + subsetSize) + + val featureMap = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(nDimensions) + )(subsetRDD) _ + + val mappedFeaturesrbf = featureMap(mappedData) + + mappedFeaturesrbf.cache() + mappedData.unpersist() + + assert(mappedFeaturesrbf.count() == nPoints) + assert(mappedFeaturesrbf.first()._2.features.size == nDimensions) - assert(subsetRDD.count() == 100) } } From 025d214802621e6a8f8b05732723bbd0025f136d Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 14:29:27 +0100 Subject: [PATCH 06/14] Minor edits to kernel test suite. --- .../org/apache/spark/mllib/kernels/KernelSuite.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index c03ce34d0ce6c..b45980f7bd972 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -85,7 +85,6 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) - } test("Testing optimal bandwidth calculation on Gaussian Kernel" + @@ -110,8 +109,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { newIndexedRDD.cache() newtestRDD.unpersist() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( SVMKernel.indexedRDD(testRDD), @@ -147,8 +146,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val mappedData = SVMKernel.indexedRDD(testRDD) mappedData.cache() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( mappedData, subsetSize) From 98d7730270d804e2140741ee0154cd03c2dde350 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 22 Dec 2014 
02:47:54 +0100 Subject: [PATCH 07/14] Initial commit for Kernels feature: Contains class heirarchies, implementation of the Nystrom method for feature map extractions, RBF and Polynomial Kernels. Also a bare bones test suite for SVM Kernels is included --- .../spark/mllib/kernels/DensityKernel.scala | 45 +++++ .../apache/spark/mllib/kernels/Kernel.scala | 40 ++++ .../mllib/kernels/PolynomialKernel.scala | 47 +++++ .../spark/mllib/kernels/RBFKernel.scala | 48 +++++ .../spark/mllib/kernels/SVMKernel.scala | 180 ++++++++++++++++++ .../spark/mllib/kernels/KernelSuite.scala | 84 ++++++++ 6 files changed, 444 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala new file mode 100644 index 0000000000000..abab0778f588f --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.rdd.RDD + +/** + * Abstract class which can be extended to + * implement various Multivariate Density + * Kernels. + */ +abstract class DensityKernel extends Kernel with Serializable{ + protected var bandwidth: Vector + + protected def setBandwidth(b: Vector): Unit = { + this.bandwidth = b + } + + protected def eval(x: Vector):Double + + override def evaluate(x: Vector, y: Vector): Double = + this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + + /** + * Calculates the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + protected def optimalBandwidth(data: RDD[Vector]): Vector + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala new file mode 100644 index 0000000000000..4730bf5dc5854 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.linalg._ +/** + * Declares a trait Kernel which would serve + * as a base trait for all classes implementing + * Machine Learning Kernels. + * + **/ + +trait Kernel { + + /** + * Evaluates the value of the kernel given two + * vectorial parameters + * + * @param x a local Vector. + * @param y a local Vector. + * + * @return the value of the Kernel function. + * + * */ + def evaluate(x: Vector, y:Vector): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala new file mode 100644 index 0000000000000..cc449477ca173 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard Polynomial SVM Kernel + * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r + */ +class PolynomialKernel(private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setDegree(d: Int): Unit = { + this.degree = d + } + + def setOffset(o: Int): Unit = { + this.offset = o + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = + Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala new file mode 100644 index 0000000000000..e6ff82033f493 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector, norm} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Standard RBF Kernel of the form + * K(Xi,Xj) = exp(-||Xi - Xj||**2/2*bandwidth**2) + */ + +class RBFKernel(private var bandwidth: Double) + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + + def setBandwidth(d: Double): Unit = { + this.bandwidth = d + } + + override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { + val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) + } + + override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): + KernelMatrix[RDD[((Int, Int), Double)]] = + SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala new file mode 100644 index 0000000000000..c5b70fc35ef37 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import breeze.linalg.{DenseVector} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.mllib.linalg +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD +/** + * Defines an abstract class outlines the basic + * functionality requirements of an SVM Kernel + */ +abstract class SVMKernel[T] extends Kernel with Logging with Serializable { + + def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + length: Long): KernelMatrix[T] + +} + +/** + * Defines a global singleton object + * [[SVMKernel]] which has useful functions + * while working with [[RDD]] of [[LabeledPoint]] + * + * */ +object SVMKernel extends Logging with Serializable{ + + /** + * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] + * + * @param data : An [[RDD]] of [[LabeledPoint]] + * + * @return An (Int, LabeledPoint) Key-Value RDD indexed + * from 0 to data.count() - 1 + * */ + def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + val sc = data.context + val i = sc.accumulator(-1, "Raw Data Index") + + data.map((point) => { + i+=1 + (i.localValue, point) + }) + } + + + /** + * This function constructs an [[SVMKernelMatrix]] + * + * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] + * @param length Length of the indexed [[RDD]] + * @param eval A function which calculates the value of the Kernel + * given two Vectors [[linalg.Vector]]. + * + * @return An [[SVMKernelMatrix]] object. + * + * */ + def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + length: Long, + eval: (linalg.Vector, linalg.Vector) => Double): + KernelMatrix[RDD[((Int, Int), Double)]] = { + + logInfo("Constructing key-value representation of kernel matrix.") + logInfo("Dimension: " + length + " x " + length) + + val labels = mappedData.map((p) => (p._1, p._2.label)) + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + eval(prod._1._2.features, prod._2._2.features))) + kernel.cache() + new SVMKernelMatrix(kernel, length, labels) + } + +} + +/** + * Defines a trait which outlines the basic + * functionality of Kernel Matrices. 
+ * */ +trait KernelMatrix[T] extends Serializable{ + protected val kernel: T + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] + def getKernelMatrix(): T = this.kernel +} + +class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], + private val dimension: Long, + private val labels: RDD[(Int, Double)]) + extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + + override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + val multiplyKernelMatrixOn = + (v :breeze.linalg.DenseVector[Double]) => { + val vbr = kernel.context.broadcast(v) + v.mapPairs((i, _) => { + //Get row number i of kernel + val row = kernel.filter((point) => i == point._1._1) + //multiply with v + var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") + row.foreach((rownum) => { + sum += rownum._2*vbr.value(rownum._1._2) + }) + sum.value + }) + } + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * @param dimensions The effective number of dimensions + * to be calculated in the feature map + * + * @return An RDD containing the non linear feature map + * of all the data points passed to the function. + * + * */ + def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { + + + logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") + val decomposition = EigenValueDecomposition + .symmetricEigs( + multiplyKernelMatrixOn, + dimension.toInt, dimensions, + 0.0001, 300) + + logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") + + //TODO: Comment here + + val rows = kernel.groupBy((couple) => { + couple._1._1 + }) + + val temp = labels.join(rows) + + temp.map((datapoint) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + val eigenvector = decomposition._2(::, i) + val eigenvalue = decomposition._1(i) + var acc = 0.0 + datapoint._2._2.foreach((p) => + acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + ) + acc + } + new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) + }) + + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala new file mode 100644 index 0000000000000..8be61ee158f73 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.kernels + +import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.scalatest.FunSuite + +class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ + + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData) + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(2, 1.5) + + val mappedData = SVMKernel.indexedRDD(testRDD) + + val kernelMatrix1 = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrix2 = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + assert(kernelMatrix1.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + assert(kernelMatrix2.getKernelMatrix().filter((point) => + point._2.isNaN || point._2.isInfinite) + .count() == 0) + + } + + test("Testing building of feature map from the kernel matrix"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + testRDD.cache() + + val rbf = new RBFKernel(1.00) + val poly = new PolynomialKernel(5, 1.5) + val mappedData = SVMKernel.indexedRDD(testRDD) + + mappedData.cache() + val kernelMatrixpoly = poly.buildKernelMatrixasRDD(mappedData, nPoints) + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) + + assert(mappedData.count() == nPoints) + val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) + val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + + assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + + } +} From bc0209b1b1d0e72e1335bb349ec04dba2a306864 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Wed, 21 Jan 2015 18:34:29 +0100 Subject: [PATCH 08/14] 1) Changes to class hierarchy 2) Commenting Nystrom approximation code --- .../spark/mllib/kernels/DensityKernel.scala | 9 +- .../mllib/kernels/GaussianDensityKernel.scala | 82 +++++++++++++++++++ .../spark/mllib/kernels/RBFKernel.scala | 2 +- .../spark/mllib/kernels/SVMKernel.scala | 19 ++++- 4 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index abab0778f588f..baec10ceceeaa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -24,14 +24,9 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -abstract class DensityKernel extends Kernel with Serializable{ - protected var bandwidth: Vector +trait DensityKernel extends Kernel with Serializable{ - protected def setBandwidth(b: Vector): Unit = { - this.bandwidth = b - } - - protected def eval(x: Vector):Double + def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala new file mode 100644 index 0000000000000..e133ed69d78bd --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.kernels + +import breeze.linalg.{norm, DenseVector} +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + + +class GaussianDensityKernel(protected var bandwidth: Vector) + extends DensityKernel with Logging { + + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b + } + + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + private def evalWithBandwidth(x: Vector, b: Vector): Double = { + val exp = scala.math.exp _ + val pow = scala.math.pow _ _ + val sqrt = scala.math.sqrt _ + val Pi = scala.math.Pi + + val buff = x.toBreeze + + val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( + b.size)( + (i) => buff(i)/b.apply(i) + ) + exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + } + + //TODO: Implement derivative function + private def derivative(n: Int)(x: Vector): Vector = { + Vectors.zeros(x.size) + } + + //TODO: Implement R integral + private def R(r: Int, pilot: Vector): Vector = { + Vectors.zeros(pilot.size) + } + + //TODO: Implement mu integral + private val mu: Vector = Vectors.zeros(this.bandwidth.size) + + override def optimalBandwidth(data: RDD[Vector]): Unit = { + + //First calculate variance of all dimensions + val columnStats = Statistics.colStats(data) + + val colvariance = columnStats.variance + + //Now calculate the initial estimates of R(f'''') and R(f'''''') + + //Use the earlier result to calculate h1 and h2 bandwidths for each + //dimension separately + + //Use the Sheathon and Jones 1991 result to calculate + //the optimal bandwidth + + //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index 
e6ff82033f493..fac11439fc192 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index c5b70fc35ef37..ff4d0d054e6bb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -155,14 +155,29 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - //TODO: Comment here - + /* + * Get row number i of the + * Kernel Matrix + * */ val rows = kernel.groupBy((couple) => { couple._1._1 }) + /* + * Join the each row i with the + * target label for point i. + * */ val temp = labels.join(rows) + /* + * Now for each data point, + * calculate n dimensions of the + * feature map where n is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) + * */ temp.map((datapoint) => { val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => val eigenvector = decomposition._2(::, i) From f8baf1e7da0ab6d71898f150d6b01cf3f38b2baf Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Tue, 27 Jan 2015 17:03:26 +0100 Subject: [PATCH 09/14] Added optimal bandwidth selection procedure for Gaussian Density Kernel --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 181 +++++++++++++++--- .../spark/mllib/kernels/KernelEstimator.scala | 41 ++++ .../mllib/kernels/PolynomialKernel.scala | 6 +- .../spark/mllib/kernels/RBFKernel.scala | 6 +- .../spark/mllib/kernels/SVMKernel.scala | 25 ++- .../spark/mllib/kernels/KernelSuite.scala | 18 ++ 7 files changed, 232 insertions(+), 56 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index baec10ceceeaa..6658c5343ace3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -17,6 +17,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -24,17 +25,15 @@ import org.apache.spark.rdd.RDD * implement various Multivariate Density * Kernels. 
*/ -trait DensityKernel extends Kernel with Serializable{ +trait DensityKernel extends Kernel with Serializable { def eval(x: Vector):Double override def evaluate(x: Vector, y: Vector): Double = this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) - /** - * Calculates the AMISE (Asymptotic Mean Integrated Square Error) - * optimal bandwidth assignment by 'solve the equation plug in method' - **/ - protected def optimalBandwidth(data: RDD[Vector]): Vector + protected def derivative(n: Int, x: Double): Double + protected val mu: Double + protected val r: Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index e133ed69d78bd..6c7621e11a208 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -23,60 +23,179 @@ import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD +import breeze.numerics.{sqrt => brsqrt} -class GaussianDensityKernel(protected var bandwidth: Vector) - extends DensityKernel with Logging { +class GaussianDensityKernel + extends DensityKernel with KernelEstimator with Logging with Serializable { + private val exp = scala.math.exp _ + private val pow = scala.math.pow _ + private val sqrt = scala.math.sqrt _ + private val Pi = scala.math.Pi + protected var bandwidth: Vector = Vectors.zeros(10) - def setBandwidth(b: linalg.Vector): Unit = { - this.bandwidth = b - } - - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + private def evalForDimension(x: Double, pilot: Double): Double = + exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) private def evalWithBandwidth(x: Vector, b: Vector): Double = { - val exp = scala.math.exp _ - val pow = scala.math.pow _ _ - val sqrt = scala.math.sqrt _ - val Pi = scala.math.Pi - + assert(x.size == b.size, + "Dimensions of vector x and the bandwidth of the kernel must match") val buff = x.toBreeze - + val bw = b.toBreeze val normalizedbuff: breeze.linalg.DenseVector[Double] = DenseVector.tabulate( - b.size)( - (i) => buff(i)/b.apply(i) + bw.size)( + (i) => buff(i)/bw(i) ) - exp(-1*pow(norm(normalizedbuff), 2)/2)/sqrt(Pi * 2) + exp(-1*pow(norm(normalizedbuff), 2)/2)/pow(sqrt(Pi * 2), b.size) } - //TODO: Implement derivative function - private def derivative(n: Int)(x: Vector): Vector = { - Vectors.zeros(x.size) + /* + * Calculate the value of the hermite polynomials + * tail recursively. This is needed to calculate + * the Gaussian derivatives at a point x. + * */ + private def hermite(n: Int, x: Double): Double = { + def hermiteHelper(k: Int, x: Double, a: Double, b: Double): Double = + k match { + case 0 => a + case 1 => b + case _ => hermiteHelper(k-1, x, b, x*b - (k-1)*a) + } + hermiteHelper(n, x, 1, x) } - //TODO: Implement R integral - private def R(r: Int, pilot: Vector): Vector = { - Vectors.zeros(pilot.size) + def setBandwidth(b: linalg.Vector): Unit = { + this.bandwidth = b } - //TODO: Implement mu integral - private val mu: Vector = Vectors.zeros(this.bandwidth.size) + override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) + + /** + * Calculates the derivative at point x for the Gaussian + * Density Kernel, for only one dimension. 
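The derivative described here relies on the identity that the nth derivative of the standard Gaussian density phi(x) equals (-1)^n * He_n(x) * phi(x), with He_n the probabilists' Hermite polynomial. A standalone sketch using the forward form of the same recursion:

// He_0 = 1, He_1 = x, He_k = x * He_{k-1} - (k - 1) * He_{k-2}
def hermiteForward(n: Int, x: Double): Double =
  if (n == 0) 1.0
  else {
    var prev = 1.0
    var cur = x
    for (k <- 2 to n) {
      val next = x * cur - (k - 1) * prev
      prev = cur
      cur = next
    }
    cur
  }

// d^n/dx^n [ exp(-x^2/2) / sqrt(2*Pi) ] = (-1)^n * He_n(x) * phi(x)
def gaussianDerivative(n: Int, x: Double): Double =
  math.pow(-1.0, n) * hermiteForward(n, x) * math.exp(-x * x / 2) / math.sqrt(2 * math.Pi)

// e.g. gaussianDerivative(2, 0.0) = (0 - 1) * phi(0) ≈ -0.3989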
+ * + * @param n The number of times the gaussian has to be differentiated + * @param x The point x at which the derivative has to evaluated + * @return The value of the nth derivative at the point x + * */ + override def derivative(n: Int, x: Double): Double = { + (1/sqrt(2*Pi))*(1/pow(-1.0,n))*exp(-1*pow(x,2)/2)*hermite(n, x) + } + + /** + * Implementation of the estimator for the R integral + * for a multivariate Gaussian Density Kernel. + * Evaluates R(D_r(f(x))). + * + * @param r the degree of the derivative of the kernel + * + * @param N The size of the original data set from which + * kernel matrix [[RDD]] was constructed. + * + * @param pilot The pilot bandwidth to be used to calculate + * the kernel values. (Note that we have not calculated + * the AMISE bandwidth yet and we use this estimator + * as a means to get the AMISE bandwidth) + * + * @param kernel The RDD containing the kernel matrix + * consisting of pairs Xi - Xj, where Xi and Xj + * are drawn from the original data set. + * + * @return R the estimated value of the integral of the square + * of the rth derivative of the kernel over the Real domain. + * */ + override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { + + + /* + * Apply map to get values of the derivative of the kernel + * at various point pairs. + * */ + val kernelNormalized = kernel.map((couple) => + (couple._1, Vectors.fromBreeze(DenseVector.tabulate(pilot.size) + ((i) => (1/(pow(N, 2)*pow(pilot(i), r + 1)))* + this.derivative(r, couple._2.toBreeze(i)/pilot(i))) + ))) + + /* + * Sum up all the individual values to get the estimated + * value of the integral + * */ + val integralvalue = kernelNormalized.reduce((a,b) => + ((0,0), Vectors.fromBreeze(a._2.toBreeze + b._2.toBreeze))) + + integralvalue._2.toBreeze + } + + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) + + /** + * Use the Sheather and Jones plug-in + * method to calculate the optimal bandwidth + * http://bit.ly/1EoBY7q + * + * */ override def optimalBandwidth(data: RDD[Vector]): Unit = { + val dataSize: Long = data.count() //First calculate variance of all dimensions val columnStats = Statistics.colStats(data) + // And then the standard deviation + val colvar = columnStats.variance.toBreeze + val colstd = colvar.map((v) => sqrt(v)) + + //Now calculate the initial estimates of R(f^6) and R(f^8) + + /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/ + + val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)( + (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi))) + + /* + * Use the earlier result to calculate + * h1 and h2 bandwidths for each dimension + * */ + + /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/ + val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9)) + + + /* + * Use h1 and h2 to calculate more + * refined estimates of R(f^6) and R(f^8) + * */ + + //Get an 0-indexed version of the original data set + val mappedData = SVMKernel.indexedRDD(data) + + /* + * Apply cartesian product on the indexed data set + * and then map it to a RDD of type [(i,j), Xi - Xj] + * */ + val kernel = mappedData.cartesian(mappedData) + .map((prod) => ((prod._1._1, prod._2._1), + Vectors.fromBreeze(prod._1._2.toBreeze - + 
prod._2._2.toBreeze)) + ) + kernel.cache() + - val colvariance = columnStats.variance + val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) - //Now calculate the initial estimates of R(f'''') and R(f'''''') + val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow((-2*this.derivative(4, 0.0))/(dataSize*this.mu*newRf6(i)), 1/7)) - //Use the earlier result to calculate h1 and h2 bandwidths for each - //dimension separately + val newRf4: breeze.linalg.Vector[Double] = this.R(4, dataSize, hAMSE, kernel) - //Use the Sheathon and Jones 1991 result to calculate - //the optimal bandwidth + val hAMISE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => + pow(this.r/(dataSize*this.mu*this.mu*newRf4(i)), 1/5)) - //Vectors.fromBreeze(breeze.linalg.DenseVector.ones[Double](10)) + this.bandwidth = Vectors.fromBreeze(hAMISE) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala new file mode 100644 index 0000000000000..dedbd4c3a6264 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.kernels + +import org.apache.spark.Logging +import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD + +/** + * Trait defining the basic behavior + * of a Kernel density estimator + */ +trait KernelEstimator extends Logging { + + protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] + + + /** + * Calculate the AMISE (Asymptotic Mean Integrated Square Error) + * optimal bandwidth assignment by 'solve the equation plug in method' + **/ + def optimalBandwidth(data: RDD[Vector]): Unit + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index cc449477ca173..51abfad97c060 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD */ class PolynomialKernel(private var degree: Int, private var offset: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable{ + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -40,8 +40,8 @@ class PolynomialKernel(private var degree: Int, override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index fac11439fc192..d5c9285e8c394 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Int, Int), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d @@ -40,9 +40,9 @@ class RBFKernel(private var bandwidth: Double) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): - KernelMatrix[RDD[((Int, Int), Double)]] = + KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index ff4d0d054e6bb..74bec1050f913 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,8 +16,7 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector} -import 
org.apache.spark.annotation.DeveloperApi +import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg import org.apache.spark.{SparkContext, Logging} import org.apache.spark.mllib.linalg._ @@ -29,7 +28,7 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Int, LabeledPoint)], + def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], length: Long): KernelMatrix[T] } @@ -50,9 +49,9 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD(data: RDD[LabeledPoint]): RDD[(Int, LabeledPoint)] = { + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { val sc = data.context - val i = sc.accumulator(-1, "Raw Data Index") + val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") data.map((point) => { i+=1 @@ -72,10 +71,10 @@ object SVMKernel extends Logging with Serializable{ * @return An [[SVMKernelMatrix]] object. * * */ - def buildSVMKernelMatrix(mappedData: RDD[(Int, LabeledPoint)], + def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], length: Long, eval: (linalg.Vector, linalg.Vector) => Double): - KernelMatrix[RDD[((Int, Int), Double)]] = { + KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") logInfo("Dimension: " + length + " x " + length) @@ -100,12 +99,12 @@ trait KernelMatrix[T] extends Serializable{ def getKernelMatrix(): T = this.kernel } -class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], +class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], private val dimension: Long, - private val labels: RDD[(Int, Double)]) - extends KernelMatrix[RDD[((Int, Int), Double)]] with Logging with Serializable { + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { - override def getKernelMatrix():RDD[((Int, Int), Double)] = this.kernel + override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel /** * Defines a function value which @@ -123,7 +122,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], //multiply with v var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2) + sum += rownum._2*vbr.value(rownum._1._2.toInt) }) sum.value }) @@ -184,7 +183,7 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Int, Int), Double)], val eigenvalue = decomposition._1(i) var acc = 0.0 datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2)/Math.sqrt(eigenvalue)) + acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) ) acc } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 8be61ee158f73..6ac6ae9b33e18 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -81,4 +81,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) } + + test("Testing optimal bandwidth calculation on Gaussian Kernel"){ + val nPoints = 100 + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A 
= 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput(A, Array[Double](B, C), nPoints, 42) + + val testRDD = sc.parallelize(testData, 2) + val newtestRDD = testRDD.map((p) => p.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + assert(kern.eval(newtestRDD.first()) != Double.NaN) + } } From 1fedafdbe7794dcd9a34f6a7317b1ff5e936c1f1 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Thu, 29 Jan 2015 14:42:53 +0100 Subject: [PATCH 10/14] Entropy based subset selection done, unit tests passing --- .../spark/mllib/kernels/SVMKernel.scala | 20 ++- .../mllib/prototype/EntropyMeasure.scala | 49 +++++++ .../mllib/prototype/EntropySelector.scala | 127 ++++++++++++++++++ .../spark/mllib/prototype/Measure.scala | 28 ++++ .../prototype/QuadraticRenyiEntropy.scala | 60 +++++++++ .../mllib/prototype/SubsetSelector.scala | 28 ++++ .../spark/mllib/kernels/KernelSuite.scala | 20 ++- 7 files changed, 319 insertions(+), 13 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 74bec1050f913..5321e55a07c70 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -39,7 +39,7 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * while working with [[RDD]] of [[LabeledPoint]] * * */ -object SVMKernel extends Logging with Serializable{ +object SVMKernel extends Logging with Serializable { /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] @@ -49,15 +49,7 @@ object SVMKernel extends Logging with Serializable{ * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = { - val sc = data.context - val i: org.apache.spark.Accumulator[Long] = sc.accumulator(-1, "Raw Data Index") - - data.map((point) => { - i+=1 - (i.localValue, point) - }) - } + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) /** @@ -87,13 +79,19 @@ object SVMKernel extends Logging with Serializable{ new SVMKernelMatrix(kernel, length, labels) } + def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): + RDD[LabeledPoint] = mappedData.join(labels).map((point) => + new LabeledPoint(point._2._2, point._2._1)) + + } /** * Defines a trait which outlines the basic * functionality of Kernel Matrices. 
* */ -trait KernelMatrix[T] extends Serializable{ +trait KernelMatrix[T] extends Serializable { protected val kernel: T def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] def getKernelMatrix(): T = this.kernel diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala new file mode 100644 index 0000000000000..73bcfa3aab30e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.Vector + +/** + * Models a general entropy measure. + * Any entropy measure would require a + * probability distribution + */ +abstract class EntropyMeasure extends Measure[Vector] +with Serializable{ + + protected val density: DensityKernel + + /** + * Given a probability distribution for + * the data set, calculate the entropy of + * the data set with respect to the given + * distribution. + * + * @param data The data set whose entropy is + * required. + * + * @return The entropy of the data set. + * */ + + def entropy[K](data: RDD[(K, Vector)]): Double + + override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala new file mode 100644 index 0000000000000..1543919c1fe53 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.{SparkContext, Logging} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.rdd.RDD + +/** + * Basic skeleton of an entropy based + * subset selector + */ +abstract class EntropySelector + extends SubsetSelector[(Long, Vector)] with Serializable + with Logging { + protected val measure: EntropyMeasure + protected val delta: Double + protected val MAX_ITERATIONS: Int +} + +class GreedyEntropySelector(m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector with Serializable + with Logging { + + override protected val measure: EntropyMeasure = m + override protected val delta: Double = del + override protected val MAX_ITERATIONS: Int = max + + override def selectPrototypes(data: RDD[(Long, Vector)], + M: Int): RDD[(Long, Vector)] = { + + val context = data.context + + /* + * Draw an initial sample of M points + * from data without replacement. + * + * Define a working set which we + * will use as a prototype set to + * to each iteration + * */ + + val workingset = data.keys.takeSample(false, M) + + val r = scala.util.Random + var it: Int = 0 + + //All the elements not in the working set + var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p)) + //Existing best value of the entropy + var oldEntropy: Double = this.measure.evaluate(data.filter((point) => + workingset.contains(point._1))) + //Store the value of entropy after an element swap + var newEntropy: Double = 0.0 + var d: Double = Double.NegativeInfinity + var rand: Int = 0 + do { + /* + * Randomly select a point from + * the working set as well as data + * and then swap them. + * */ + rand = r.nextInt(workingset.length - 1) + val point1 = workingset.apply(rand) + + val point2 = newDataset.takeSample(false, 1).apply(0) + + //Update the working set + workingset(rand) = point2 + //Calculate the new entropy + newEntropy = this.measure.evaluate(data.filter((p) => + workingset.contains(p._1))) + + /* + * Calculate the change in entropy, + * if it has improved then keep the + * swap, otherwise revert to existing + * working set. + * */ + d = newEntropy - oldEntropy + + if(d < 0) { + /* + * Improvement in entropy so + * keep the updated working set + * as it is and update the + * variable 'newDataset' + * */ + oldEntropy = newEntropy + newDataset = data.keys.filter((p) => !workingset.contains(p)) + } else { + /* + * No improvement in entropy + * so revert the working set + * to its initial state. Leave + * the variable newDataset as + * it is. + * */ + workingset(rand) = point1 + } + + it += 1 + } while(math.abs(d) >= this.delta && + it <= this.MAX_ITERATIONS) + + //Time to return the final working set + data.filter((p) => workingset.contains(p._1)) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala new file mode 100644 index 0000000000000..80d466fb18ee3 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/Measure.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Trait which outlines basic behavior + * of a subset utility measure. + */ +trait Measure[T] { + def evaluate[K](data: RDD[(K, T)]): Double +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala new file mode 100644 index 0000000000000..d2fcbaef381e8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.prototype + +import breeze.linalg.DenseVector +import org.apache.spark.Logging +import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Vectors, Vector} + +/** + * Implements the quadratic Renyi Entropy + */ +class QuadraticRenyiEntropy(dist: DensityKernel) + extends EntropyMeasure with Serializable with Logging { + + val log_e = scala.math.log _ + val sqrt = scala.math.sqrt _ + + override protected val density: DensityKernel = dist + + /** + * Calculate the quadratic Renyi entropy + * within a distribution specific + * proportionality constant. This can + * be used to compare the entropy values of + * different sets of data on the same + * distribution. + * + * @param data The data set whose entropy is + * required. + * @return The entropy of the dataset assuming + * it is distributed as given by the value + * parameter 'density'. 
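+ *
+ * Concretely, the quadratic Renyi entropy is
+ * H2(X) = -log( Integral( p(x)^2 dx ) ); with a Gaussian kernel
+ * density estimate this integral reduces, up to the proportionality
+ * constant mentioned above, to the pairwise sum
+ * Sum over all pairs (i, j) of K( (X_i - X_j) / sqrt(2) ),
+ * which is the quantity computed below.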
+ * */ + + override def entropy[K](data: RDD[(K, Vector)]): Double = { + val dim = data.first()._2.size + val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) + -1*log_e(data.cartesian(data).map((couple) => + density.evaluate( + Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), + Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) + )).reduce((a,b) => a + b)) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala new file mode 100644 index 0000000000000..c96bcb0dd3a3e --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/SubsetSelector.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.mllib.prototype + +import org.apache.spark.rdd.RDD + +/** + * Defines the characteristics of + * a subset selector + */ +trait SubsetSelector[T] extends Serializable{ + def selectPrototypes(data: RDD[T], M: Int): RDD[T] +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 6ac6ae9b33e18..26f163ada25c2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,7 +16,9 @@ */ package org.apache.spark.mllib.kernels +import breeze.linalg.norm import org.apache.spark.mllib.classification.SVMSuite +import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.scalatest.FunSuite @@ -82,8 +84,9 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { } - test("Testing optimal bandwidth calculation on Gaussian Kernel"){ - val nPoints = 100 + test("Testing optimal bandwidth calculation on Gaussian Kernel" + + " and maximum entropy subset selection"){ + val nPoints = 10000 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 @@ -98,5 +101,18 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kern = new GaussianDensityKernel() kern.optimalBandwidth(newtestRDD) assert(kern.eval(newtestRDD.first()) != Double.NaN) + + val newIndexedRDD = SVMKernel.indexedRDD(newtestRDD) + newIndexedRDD.cache() + newtestRDD.unpersist() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + + val subsetRDD = subsetsel.selectPrototypes( + newIndexedRDD, + 100) + + assert(subsetRDD.count() == 100) } } From 6b8d8db8466b8795f8710d8861fc8f1ed541543f Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 00:26:03 +0100 
Subject: [PATCH 11/14] 1) Optimized code for feature map extraction, kernel matrix multiplication 2) Code indentation changes --- .../spark/mllib/kernels/DensityKernel.scala | 11 +- .../mllib/kernels/GaussianDensityKernel.scala | 25 +-- .../apache/spark/mllib/kernels/Kernel.scala | 4 +- .../spark/mllib/kernels/KernelEstimator.scala | 9 +- .../mllib/kernels/PolynomialKernel.scala | 20 +- .../spark/mllib/kernels/RBFKernel.scala | 18 +- .../spark/mllib/kernels/SVMKernel.scala | 205 ++++++++++-------- .../mllib/prototype/EntropyMeasure.scala | 9 +- .../mllib/prototype/EntropySelector.scala | 31 +-- .../prototype/QuadraticRenyiEntropy.scala | 18 +- .../spark/mllib/kernels/KernelSuite.scala | 80 ++++++- 11 files changed, 258 insertions(+), 172 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala index 6658c5343ace3..7f8b7a06af7cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.rdd.RDD /** * Abstract class which can be extended to @@ -26,14 +25,14 @@ import org.apache.spark.rdd.RDD * Kernels. */ trait DensityKernel extends Kernel with Serializable { + protected val mu: Double + protected val r: Double def eval(x: Vector):Double - override def evaluate(x: Vector, y: Vector): Double = - this.eval(Vectors.fromBreeze(x.toBreeze.-=(y.toBreeze))) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + this.eval(Vectors.fromBreeze(x.features.toBreeze.-=(y.features.toBreeze))) protected def derivative(n: Int, x: Double): Double - protected val mu: Double - protected val r: Double -} + } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala index 6c7621e11a208..6de1c51c89df4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala @@ -19,20 +19,22 @@ package org.apache.spark.mllib.kernels import breeze.linalg.{norm, DenseVector} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD -import breeze.numerics.{sqrt => brsqrt} - class GaussianDensityKernel - extends DensityKernel with KernelEstimator with Logging with Serializable { + extends DensityKernel + with KernelEstimator + with Logging + with Serializable { private val exp = scala.math.exp _ private val pow = scala.math.pow _ private val sqrt = scala.math.sqrt _ private val Pi = scala.math.Pi protected var bandwidth: Vector = Vectors.zeros(10) + override protected val mu = (1/4)*(1/sqrt(Pi)) + override protected val r = (1/2)*(1/sqrt(Pi)) private def evalForDimension(x: Double, pilot: Double): Double = exp(-1*pow(x/pilot, 2)/2)/sqrt(Pi * 2) @@ -64,12 +66,11 @@ class GaussianDensityKernel hermiteHelper(n, x, 1, x) } - def setBandwidth(b: linalg.Vector): Unit = { + def setBandwidth(b: Vector): Unit = { this.bandwidth = b } - override def eval(x: linalg.Vector) = evalWithBandwidth(x, this.bandwidth) - + override def eval(x: Vector) = 
evalWithBandwidth(x, this.bandwidth) /** * Calculates the derivative at point x for the Gaussian @@ -105,9 +106,9 @@ class GaussianDensityKernel * @return R the estimated value of the integral of the square * of the rth derivative of the kernel over the Real domain. * */ - override protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { - + override protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] = { /* * Apply map to get values of the derivative of the kernel @@ -129,9 +130,6 @@ class GaussianDensityKernel integralvalue._2.toBreeze } - override protected val mu = (1/4)*(1/sqrt(Pi)) - override protected val r = (1/2)*(1/sqrt(Pi)) - /** * Use the Sheather and Jones plug-in * method to calculate the optimal bandwidth @@ -185,7 +183,6 @@ class GaussianDensityKernel ) kernel.cache() - val newRf6: breeze.linalg.Vector[Double] = this.R(8, dataSize, h2, kernel) val hAMSE: breeze.linalg.Vector[Double] = DenseVector.tabulate(colstd.size)((i) => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala index 4730bf5dc5854..3d945fa6e22b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala @@ -17,6 +17,8 @@ package org.apache.spark.mllib.kernels import org.apache.spark.mllib.linalg._ +import org.apache.spark.mllib.regression.LabeledPoint + /** * Declares a trait Kernel which would serve * as a base trait for all classes implementing @@ -36,5 +38,5 @@ trait Kernel { * @return the value of the Kernel function. 
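 *
 * A minimal implementation (hypothetical, shown only for illustration)
 * is a linear kernel:
 * {{{
 *   object LinearKernel extends Kernel {
 *     override def evaluate(x: LabeledPoint, y: LabeledPoint): Double =
 *       x.features.toBreeze dot y.features.toBreeze
 *   }
 * }}}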
* * */ - def evaluate(x: Vector, y:Vector): Double + def evaluate(x: LabeledPoint, y: LabeledPoint): Double } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala index dedbd4c3a6264..03cc504bc34c3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala @@ -18,8 +18,7 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg.{DenseVector, Vectors, Vector} -import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD /** @@ -28,9 +27,9 @@ import org.apache.spark.rdd.RDD */ trait KernelEstimator extends Logging { - protected def R(r: Int, N: Long, pilot: breeze.linalg.Vector[Double], - kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] - + protected def R( + r: Int, N: Long, pilot: breeze.linalg.Vector[Double], + kernel: RDD[((Long, Long), Vector)]): breeze.linalg.Vector[Double] /** * Calculate the AMISE (Asymptotic Mean Integrated Square Error) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala index 51abfad97c060..828aca0b48570 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/PolynomialKernel.scala @@ -17,7 +17,6 @@ package org.apache.spark.mllib.kernels import org.apache.spark.Logging -import org.apache.spark.mllib.linalg import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -25,9 +24,12 @@ import org.apache.spark.rdd.RDD * Standard Polynomial SVM Kernel * of the form K(Xi,Xj) = (Xi^T * Xj + d)^r */ -class PolynomialKernel(private var degree: Int, - private var offset: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable{ +class PolynomialKernel( + private var degree: Int, + private var offset: Double) + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable{ def setDegree(d: Int): Unit = { this.degree = d @@ -37,11 +39,11 @@ class PolynomialKernel(private var degree: Int, this.offset = o } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = - Math.pow(x.toBreeze dot y.toBreeze + this.offset, this.degree) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = + Math.pow(x.features.toBreeze dot y.features.toBreeze + this.offset, this.degree) - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala index d5c9285e8c394..3b78b159d43b1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/RBFKernel.scala @@ -16,10 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.{DenseVector, norm} import org.apache.spark.Logging -import org.apache.spark.mllib.linalg -import 
org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -29,20 +27,22 @@ import org.apache.spark.rdd.RDD */ class RBFKernel(private var bandwidth: Double) - extends SVMKernel[RDD[((Long, Long), Double)]] with Logging with Serializable { + extends SVMKernel[RDD[((Long, Long), Double)]] + with Logging + with Serializable { def setBandwidth(d: Double): Unit = { this.bandwidth = d } - override def evaluate(x: linalg.Vector, y: linalg.Vector): Double = { - val diff: linalg.Vector = Vectors.fromBreeze(x.toBreeze - y.toBreeze) + override def evaluate(x: LabeledPoint, y: LabeledPoint): Double = { + val diff: Vector = Vectors.fromBreeze(x.features.toBreeze - y.features.toBreeze) Math.exp(-1*Math.pow(Vectors.norm(diff, 2.0), 2)/(2*Math.pow(bandwidth, 2))) } - override def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): - KernelMatrix[RDD[((Long, Long), Double)]] = + override def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[RDD[((Long, Long), Double)]] = SVMKernel.buildSVMKernelMatrix(mappedData, length, this.evaluate) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala index 5321e55a07c70..a4a11dc53e2d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala @@ -16,9 +16,8 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.DenseVector -import org.apache.spark.mllib.linalg -import org.apache.spark.{SparkContext, Logging} +import breeze.linalg.{DenseVector, DenseMatrix} +import org.apache.spark.Logging import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @@ -28,9 +27,67 @@ import org.apache.spark.rdd.RDD */ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { - def buildKernelMatrixasRDD(mappedData: RDD[(Long, LabeledPoint)], - length: Long): KernelMatrix[T] + /** + * Build the kernel matrix of the prototype vectors + * + * @param mappedData The prototype vectors/points + * + * @param length The number of points + * + * @return A [[KernelMatrix]] object + * + * + * */ + def buildKernelMatrixasRDD( + mappedData: RDD[(Long, LabeledPoint)], + length: Long): KernelMatrix[T] + + /** + * Builds an approximate nonlinear feature map + * which corresponds to an SVM Kernel. This is + * done using the Nystrom method i.e. approximating + * the eigenvalues and eigenvectors of the Kernel + * matrix of a given RDD + * + * For each data point, + * calculate m dimensions of the + * feature map where m is the number + * of eigenvalues/vectors obtained from + * the Eigen Decomposition. + * + * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, m, K(k, x)*eigenvector(i)(k)) + * + * @param decomposition The Eigenvalue decomposition calculated + * from the kernel matrix of the prototype + * subset. + * @param prototypes The prototype subset. + * + * @param data The dataset [[RDD]] on which the feature map + * is to be applied. 
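+ *
+ * A typical invocation, mirroring the pattern used in KernelSuite, is:
+ * {{{
+ *   val featureMap = kernel.featureMapping(
+ *     kernelMatrix.eigenDecomposition(nDimensions))(prototypes) _
+ *   val mappedData = featureMap(indexedData)
+ * }}}
+ * where kernel, kernelMatrix, prototypes and indexedData stand for a
+ * concrete SVMKernel, its KernelMatrix, the prototype subset and the
+ * full indexed data set respectively.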
+ * + * */ + def featureMapping(decomposition: (DenseVector[Double], DenseMatrix[Double])) + (prototypes: RDD[(Long, LabeledPoint)]) + (data: RDD[(Long, LabeledPoint)]) + : RDD[(Long, LabeledPoint)] = { + + logInfo("Calculating the Non Linear feature map of data set") + + data.cartesian(prototypes) + .map((couple) => { + val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => + var eigenvector = 0.0 + if (couple._2._1.toInt < decomposition._1.length) { + eigenvector = decomposition._2(couple._2._1.toInt, i) + } + val eigenvalue = decomposition._1(i) + this.evaluate(couple._1._2, couple._2._2) * eigenvector/Math.sqrt(eigenvalue) + } + (couple._1._1, (couple._1._2.label, y)) + }).reduceByKey((veca, vecb) => (veca._1, veca._2 + vecb._2)) + .map((p) => (p._1, new LabeledPoint(p._2._1, Vectors.fromBreeze(p._2._2)))) + } } /** @@ -41,6 +98,32 @@ abstract class SVMKernel[T] extends Kernel with Logging with Serializable { * */ object SVMKernel extends Logging with Serializable { + /** + * Defines a function value which + * calculates the multiplication of + * the Kernel Matrix with a Breeze + * Vector and returns the result as a + * Breeze DenseVector. + * */ + def multiplyKernelMatrixBy(kernel: RDD[((Long, Long), Double)]) + (v :breeze.linalg.DenseVector[Double]): + DenseVector[Double] = { + val vbr = kernel.context.broadcast(v) + val result: DenseVector[Double] = + DenseVector.tabulate(v.length)( + (i) => { + //Get row number i of kernel + val row = DenseVector.apply(kernel + .filter((point) => i == point._1._1) + .map((p) => p._2) + .collect()) + //dot product with v + vbr.value.t * row + } + ) + result + } + /** * Returns an indexed [[RDD]] from a non indexed [[RDD]] of [[LabeledPoint]] * @@ -49,8 +132,8 @@ object SVMKernel extends Logging with Serializable { * @return An (Int, LabeledPoint) Key-Value RDD indexed * from 0 to data.count() - 1 * */ - def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = data.zipWithIndex().map((p) => (p._2, p._1)) - + def indexedRDD[T](data: RDD[T]): RDD[(Long, T)] = + data.zipWithIndex().map((p) => (p._2, p._1)) /** * This function constructs an [[SVMKernelMatrix]] @@ -58,14 +141,15 @@ object SVMKernel extends Logging with Serializable { * @param mappedData The indexed [[RDD]] of [[LabeledPoint]] * @param length Length of the indexed [[RDD]] * @param eval A function which calculates the value of the Kernel - * given two Vectors [[linalg.Vector]]. + * given two Labeled Points [[LabeledPoint]]. * * @return An [[SVMKernelMatrix]] object. 
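 *
 * A hypothetical call, following the usage in KernelSuite (labeledData
 * stands for any RDD of LabeledPoint), is:
 * {{{
 *   val indexed = SVMKernel.indexedRDD(labeledData)
 *   val kMatrix = SVMKernel.buildSVMKernelMatrix(
 *     indexed, indexed.count(), new RBFKernel(0.8).evaluate)
 * }}}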
* * */ - def buildSVMKernelMatrix(mappedData: RDD[(Long, LabeledPoint)], - length: Long, - eval: (linalg.Vector, linalg.Vector) => Double): + def buildSVMKernelMatrix( + mappedData: RDD[(Long, LabeledPoint)], + length: Long, + eval: (LabeledPoint, LabeledPoint) => Double): KernelMatrix[RDD[((Long, Long), Double)]] = { logInfo("Constructing key-value representation of kernel matrix.") @@ -74,17 +158,19 @@ object SVMKernel extends Logging with Serializable { val labels = mappedData.map((p) => (p._1, p._2.label)) val kernel = mappedData.cartesian(mappedData) .map((prod) => ((prod._1._1, prod._2._1), - eval(prod._1._2.features, prod._2._2.features))) + eval(prod._1._2, prod._2._2))) kernel.cache() new SVMKernelMatrix(kernel, length, labels) } - def zipVectorsWithLabels(mappedData: RDD[(Long, Vector)], - labels: RDD[(Long, Double)]): - RDD[LabeledPoint] = mappedData.join(labels).map((point) => + def zipVectorsWithLabels( + mappedData: RDD[(Long, Vector)], + labels: RDD[(Long, Double)]): RDD[LabeledPoint] = + mappedData.join(labels).map((point) => new LabeledPoint(point._2._2, point._2._1)) - + def unzipIndexedData(mappedData: RDD[(Long, LabeledPoint)]): + RDD[LabeledPoint] = mappedData.map((p) => p._2) } /** @@ -93,38 +179,19 @@ object SVMKernel extends Logging with Serializable { * */ trait KernelMatrix[T] extends Serializable { protected val kernel: T - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] - def getKernelMatrix(): T = this.kernel -} -class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)], - private val dimension: Long, - private val labels: RDD[(Long, Double)]) - extends KernelMatrix[RDD[((Long, Long), Double)]] with Logging with Serializable { + def eigenDecomposition(dimensions: Int): (DenseVector[Double], DenseMatrix[Double]) - override def getKernelMatrix():RDD[((Long, Long), Double)] = this.kernel + def getKernelMatrix(): T = this.kernel +} - /** - * Defines a function value which - * calculates the multiplication of - * the Kernel Matrix with a Breeze - * Vector and returns the result as a - * Breeze DenseVector. - * */ - val multiplyKernelMatrixOn = - (v :breeze.linalg.DenseVector[Double]) => { - val vbr = kernel.context.broadcast(v) - v.mapPairs((i, _) => { - //Get row number i of kernel - val row = kernel.filter((point) => i == point._1._1) - //multiply with v - var sum = kernel.context.accumulator(0.00, "Multiplication product, vector") - row.foreach((rownum) => { - sum += rownum._2*vbr.value(rownum._1._2.toInt) - }) - sum.value - }) - } +class SVMKernelMatrix( + override protected val kernel: RDD[((Long, Long), Double)], + private val dimension: Long, + private val labels: RDD[(Long, Double)]) + extends KernelMatrix[RDD[((Long, Long), Double)]] + with Logging + with Serializable { /** * Builds an approximate nonlinear feature map @@ -140,53 +207,13 @@ class SVMKernelMatrix(protected override val kernel: RDD[((Long, Long), Double)] * of all the data points passed to the function. 
* * */ - def buildFeatureMap(dimensions: Int): RDD[LabeledPoint] = { - - + override def eigenDecomposition(dimensions: Int = this.dimension.toInt): + (DenseVector[Double], DenseMatrix[Double]) = { logInfo("Eigenvalue decomposition of the kernel matrix using ARPACK.") - val decomposition = EigenValueDecomposition + EigenValueDecomposition .symmetricEigs( - multiplyKernelMatrixOn, + SVMKernel.multiplyKernelMatrixBy(kernel), dimension.toInt, dimensions, 0.0001, 300) - - logInfo("Applying Nystrom formula to calculate feature map of kernel matrix") - - /* - * Get row number i of the - * Kernel Matrix - * */ - val rows = kernel.groupBy((couple) => { - couple._1._1 - }) - - /* - * Join the each row i with the - * target label for point i. - * */ - val temp = labels.join(rows) - - /* - * Now for each data point, - * calculate n dimensions of the - * feature map where n is the number - * of eigenvalues/vectors obtained from - * the Eigen Decomposition. - * - * phi_i(x) = (1/sqrt(eigenvalue(i)))*Sum(k, 1, n, K(k, x)*eigenvector(i)(k)) - * */ - temp.map((datapoint) => { - val y: DenseVector[Double] = DenseVector.tabulate(decomposition._1.length){i => - val eigenvector = decomposition._2(::, i) - val eigenvalue = decomposition._1(i) - var acc = 0.0 - datapoint._2._2.foreach((p) => - acc += (p._2 * eigenvector(p._1._2.toInt)/Math.sqrt(eigenvalue)) - ) - acc - } - new LabeledPoint(datapoint._2._1, Vectors.fromBreeze(y)) - }) - } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala index 73bcfa3aab30e..78ffbda08b3d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropyMeasure.scala @@ -18,16 +18,15 @@ package org.apache.spark.mllib.prototype import org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.Vector /** * Models a general entropy measure. * Any entropy measure would require a * probability distribution */ -abstract class EntropyMeasure extends Measure[Vector] -with Serializable{ +abstract class EntropyMeasure extends Measure[LabeledPoint] with Serializable { protected val density: DensityKernel @@ -43,7 +42,7 @@ with Serializable{ * @return The entropy of the data set. 
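 *
 * Concrete implementations such as [[QuadraticRenyiEntropy]] estimate
 * this quantity from pairwise evaluations of the supplied density kernel.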
* */ - def entropy[K](data: RDD[(K, Vector)]): Double + def entropy[K](data: RDD[(K, LabeledPoint)]): Double - override def evaluate[K](data: RDD[(K, Vector)]): Double = this.entropy(data) + override def evaluate[K](data: RDD[(K, LabeledPoint)]): Double = this.entropy(data) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala index 1543919c1fe53..34d94544c26a4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala @@ -17,8 +17,8 @@ package org.apache.spark.mllib.prototype -import org.apache.spark.{SparkContext, Logging} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.Logging +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD /** @@ -26,27 +26,29 @@ import org.apache.spark.rdd.RDD * subset selector */ abstract class EntropySelector - extends SubsetSelector[(Long, Vector)] with Serializable + extends SubsetSelector[(Long, LabeledPoint)] + with Serializable with Logging { protected val measure: EntropyMeasure protected val delta: Double protected val MAX_ITERATIONS: Int } -class GreedyEntropySelector(m: EntropyMeasure, - del: Double = 0.0001, - max: Int = 5000) - extends EntropySelector with Serializable +class GreedyEntropySelector( + m: EntropyMeasure, + del: Double = 0.0001, + max: Int = 5000) + extends EntropySelector + with Serializable with Logging { override protected val measure: EntropyMeasure = m override protected val delta: Double = del override protected val MAX_ITERATIONS: Int = max - override def selectPrototypes(data: RDD[(Long, Vector)], - M: Int): RDD[(Long, Vector)] = { - - val context = data.context + override def selectPrototypes( + data: RDD[(Long, LabeledPoint)], + M: Int): RDD[(Long, LabeledPoint)] = { /* * Draw an initial sample of M points @@ -56,7 +58,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * will use as a prototype set to * to each iteration * */ - + logInfo("Initializing the working set, by drawing randomly from the training set") val workingset = data.keys.takeSample(false, M) val r = scala.util.Random @@ -71,6 +73,7 @@ class GreedyEntropySelector(m: EntropyMeasure, var newEntropy: Double = 0.0 var d: Double = Double.NegativeInfinity var rand: Int = 0 + logInfo("Starting iterative, entropy based greedy subset selection") do { /* * Randomly select a point from @@ -96,7 +99,7 @@ class GreedyEntropySelector(m: EntropyMeasure, * */ d = newEntropy - oldEntropy - if(d < 0) { + if(d > 0) { /* * Improvement in entropy so * keep the updated working set @@ -119,7 +122,7 @@ class GreedyEntropySelector(m: EntropyMeasure, it += 1 } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS) - + logInfo("Working set obtained, now starting process of packaging it as an RDD") //Time to return the final working set data.filter((p) => workingset.contains(p._1)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala index d2fcbaef381e8..3613dba8a723e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/QuadraticRenyiEntropy.scala @@ -19,18 +19,20 @@ package org.apache.spark.mllib.prototype import breeze.linalg.DenseVector import org.apache.spark.Logging import 
org.apache.spark.mllib.kernels.DensityKernel +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * Implements the quadratic Renyi Entropy */ class QuadraticRenyiEntropy(dist: DensityKernel) - extends EntropyMeasure with Serializable with Logging { + extends EntropyMeasure + with Serializable + with Logging { val log_e = scala.math.log _ val sqrt = scala.math.sqrt _ - override protected val density: DensityKernel = dist /** @@ -48,13 +50,11 @@ class QuadraticRenyiEntropy(dist: DensityKernel) * parameter 'density'. * */ - override def entropy[K](data: RDD[(K, Vector)]): Double = { - val dim = data.first()._2.size + override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { + val dim = data.first()._2.features.size val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) -1*log_e(data.cartesian(data).map((couple) => - density.evaluate( - Vectors.fromBreeze(couple._1._2.toBreeze :/ root_two), - Vectors.fromBreeze(couple._2._2.toBreeze :/ root_two) - )).reduce((a,b) => a + b)) + density.eval(Vectors.fromBreeze(couple._1._2.features.toBreeze :/ root_two - + couple._2._2.features.toBreeze :/ root_two))).reduce((a,b) => a + b)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index 26f163ada25c2..c03ce34d0ce6c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -16,13 +16,13 @@ */ package org.apache.spark.mllib.kernels -import breeze.linalg.norm +import org.scalatest.FunSuite import org.apache.spark.mllib.classification.SVMSuite import org.apache.spark.mllib.prototype.{QuadraticRenyiEntropy, GreedyEntropySelector} import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.scalatest.FunSuite class KernelSuite extends FunSuite with MLlibTestSparkContext { + test("Testing evaluate function of Polynomial and RBF Functions"){ val nPoints = 100 @@ -76,18 +76,22 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val kernelMatrixRBF = rbf.buildKernelMatrixasRDD(mappedData, nPoints) assert(mappedData.count() == nPoints) - val mappedFeaturespoly = kernelMatrixpoly.buildFeatureMap(3) - val mappedFeaturesrbf = kernelMatrixRBF.buildFeatureMap(5) + val mappedFeaturespoly = poly.featureMapping( + kernelMatrixpoly.eigenDecomposition(99) + )(mappedData)(mappedData) + val mappedFeaturesrbf = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(99) + )(mappedData)(mappedData) - assert(mappedFeaturespoly.filter((point) => point.features.size == 3).count() == 100) - assert(mappedFeaturesrbf.filter((point) => point.features.size == 5).count() == 100) + assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) + assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) } test("Testing optimal bandwidth calculation on Gaussian Kernel" + " and maximum entropy subset selection"){ - val nPoints = 10000 - + val nPoints = 1000 + val subsetSize = 100 // NOTE: Intercept should be small for generating equal 0s and 1s val A = 0.01 val B = -1.5 @@ -110,9 +114,63 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( - newIndexedRDD, 
- 100) + SVMKernel.indexedRDD(testRDD), + subsetSize) + + assert(subsetRDD.count() == subsetSize) + } + + test("Testing rbf kernel with subset selection and feature map extraction") { + val nPoints = 1000 + val nDimensions = 5 + val subsetSize = 100 + val unZip = SVMKernel.unzipIndexedData _ + + // NOTE: Intercept should be small for generating equal 0s and 1s + val A = 0.01 + val B = -1.5 + val C = 1.0 + + val testData = SVMSuite.generateSVMInput( + A, + Array[Double](B, C), + nPoints, + 42) + + val testRDD = sc.parallelize(testData, 2) + + val newtestRDD = testRDD.map(_.features) + newtestRDD.cache() + val kern = new GaussianDensityKernel() + kern.optimalBandwidth(newtestRDD) + newtestRDD.unpersist() + val mappedData = SVMKernel.indexedRDD(testRDD) + mappedData.cache() + + val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) + val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val subsetRDD = subsetsel.selectPrototypes( + mappedData, + subsetSize) + + val rbf = new RBFKernel(0.8) + subsetRDD.cache() + + val kernelMatrixRBF = rbf.buildKernelMatrixasRDD( + SVMKernel.indexedRDD(unZip(subsetRDD)), + subsetSize) + + val featureMap = rbf.featureMapping( + kernelMatrixRBF.eigenDecomposition(nDimensions) + )(subsetRDD) _ + + val mappedFeaturesrbf = featureMap(mappedData) + + mappedFeaturesrbf.cache() + mappedData.unpersist() + + assert(mappedFeaturesrbf.count() == nPoints) + assert(mappedFeaturesrbf.first()._2.features.size == nDimensions) - assert(subsetRDD.count() == 100) } } From 9367dc14f672976781a9ea6bdf54d503104e6d2a Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Sun, 1 Feb 2015 14:29:27 +0100 Subject: [PATCH 12/14] Minor edits to kernel test suite. --- .../org/apache/spark/mllib/kernels/KernelSuite.scala | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala index c03ce34d0ce6c..b45980f7bd972 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/kernels/KernelSuite.scala @@ -85,7 +85,6 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { assert(mappedFeaturespoly.filter((point) => point._2.features.size == 99).count() == 100) assert(mappedFeaturesrbf.filter((point) => point._2.features.size == 99).count() == 100) - } test("Testing optimal bandwidth calculation on Gaussian Kernel" + @@ -110,8 +109,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { newIndexedRDD.cache() newtestRDD.unpersist() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( SVMKernel.indexedRDD(testRDD), @@ -147,8 +146,8 @@ class KernelSuite extends FunSuite with MLlibTestSparkContext { val mappedData = SVMKernel.indexedRDD(testRDD) mappedData.cache() - val entropy: QuadraticRenyiEntropy = new QuadraticRenyiEntropy(kern) - val subsetsel: GreedyEntropySelector = new GreedyEntropySelector(entropy) + val entropy = new QuadraticRenyiEntropy(kern) + val subsetsel = new GreedyEntropySelector(entropy) val subsetRDD = subsetsel.selectPrototypes( mappedData, subsetSize) From 7f4dfae134b915e9c882700c1df695dbaaaee489 Mon Sep 17 00:00:00 2001 From: mandar2812 Date: Mon, 13 Apr 2015 
From 7f4dfae134b915e9c882700c1df695dbaaaee489 Mon Sep 17 00:00:00 2001
From: mandar2812
Date: Mon, 13 Apr 2015 16:54:28 +0200
Subject: [PATCH 13/14] Minor comment clean up

---
 .../org/apache/spark/mllib/kernels/GaussianDensityKernel.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
index 6de1c51c89df4..2fef3ee89f224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
@@ -99,7 +99,7 @@ class GaussianDensityKernel
    * the AMISE bandwidth yet and we use this estimator
    * as a means to get the AMISE bandwidth)
    *
-   * @param kernel The RDD containing the kernel matrix
+   * @param kernel The RDD containing the matrix
    * consisting of pairs Xi - Xj, where Xi and Xj
    * are drawn from the original data set.
    *

From 6b95548db3bbf180c330ee69121b371bcbc0f83e Mon Sep 17 00:00:00 2001
From: mandar2812
Date: Tue, 14 Apr 2015 02:10:13 +0200
Subject: [PATCH 14/14] Scala style check changes

---
 .../spark/mllib/kernels/DensityKernel.scala   |  2 +-
 .../mllib/kernels/GaussianDensityKernel.scala | 19 +++++++------------
 .../apache/spark/mllib/kernels/Kernel.scala   |  6 +-----
 .../spark/mllib/kernels/KernelEstimator.scala |  2 +-
 .../spark/mllib/kernels/SVMKernel.scala       |  4 ++--
 .../mllib/prototype/EntropySelector.scala     | 12 ++++++------
 6 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
index 7f8b7a06af7cc..8ee4a45556cf0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/DensityKernel.scala
@@ -35,4 +35,4 @@ trait DensityKernel extends Kernel with Serializable {
 
   protected def derivative(n: Int, x: Double): Double
 
-  }
+}

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
index 2fef3ee89f224..23260b74f6516 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/GaussianDensityKernel.scala
@@ -70,7 +70,7 @@ class GaussianDensityKernel
     this.bandwidth = b
   }
 
-  override def eval(x: Vector) = evalWithBandwidth(x, this.bandwidth)
+  override def eval(x: Vector):Double = evalWithBandwidth(x, this.bandwidth)
 
   /**
    * Calculates the derivative at point x for the Gaussian
@@ -139,37 +139,32 @@ class GaussianDensityKernel
   override def optimalBandwidth(data: RDD[Vector]): Unit = {
     val dataSize: Long = data.count()
 
-    //First calculate variance of all dimensions
+    // First calculate variance of all dimensions
     val columnStats = Statistics.colStats(data)
 
     // And then the standard deviation
     val colvar = columnStats.variance.toBreeze
    val colstd = colvar.map((v) => sqrt(v))
 
-    //Now calculate the initial estimates of R(f^6) and R(f^8)
-
-    /*val Rf6: DenseVector[Double] = DenseVector.tabulate(colstd.size)(
-      (i) => -15.0*pow(colstd(i), -7.0)/(16*sqrt(Pi)))*/
+    // Now calculate the initial estimates of R(f^6) and R(f^8)
 
     val Rf8: DenseVector[Double] = DenseVector.tabulate(colstd.size)(
      (i) => 105*pow(colstd(i), -9.0)/(32*sqrt(Pi)))
 
     /*
     * Use the earlier result to calculate
-    * h1 and h2 bandwidths for each dimension
+    * h2, the bandwidth for each dimension
     * */
-    /*val h1: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) =>
-      pow(-2*this.derivative(4, 0.0)/(dataSize*this.mu*Rf6(i)), 1/7))*/
 
     val h2: DenseVector[Double] = DenseVector.tabulate(colstd.size)((i) =>
       pow(-2*this.derivative(6, 0.0)/(dataSize*this.mu*Rf8(i)), 1/9))
 
     /*
-    * Use h1 and h2 to calculate more
-    * refined estimates of R(f^6) and R(f^8)
+    * Use h2 to calculate more
+    * refined estimates of R(f^6)
     * */
 
-    //Get an 0-indexed version of the original data set
+    // Get an 0-indexed version of the original data set
     val mappedData = SVMKernel.indexedRDD(data)
 
     /*
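For reference, the plug-in rule implemented in optimalBandwidth above computes, per dimension d (with sigma_d the sample standard deviation, n the data size, mu the kernel constant this.mu, and K^(6) the sixth derivative of the kernel at zero):

    R(f^(8))_d = 105 / (32 * sqrt(pi) * sigma_d^9)
    h_d        = ( -2 * K^(6)(0) / (n * mu * R(f^(8))_d) )^(1/9)

One caveat worth flagging: in the Scala source the exponent is written as 1/9 (and 1/7 in the removed h1 line), which is integer division and evaluates to 0, so pow(..., 1/9) returns 1.0; the intended exponent is presumably 1.0/9.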
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
index 3d945fa6e22b5..13f50077744e3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/Kernel.scala
@@ -16,16 +16,13 @@
  */
 package org.apache.spark.mllib.kernels
 
-import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.regression.LabeledPoint
 
 /**
  * Declares a trait Kernel which would serve
  * as a base trait for all classes implementing
  * Machine Learning Kernels.
- *
- **/
-
+ * */
 trait Kernel {
 
   /**
@@ -36,7 +33,6 @@ trait Kernel {
    * @param y a local Vector.
    *
    * @return the value of the Kernel function.
-   *
    * */
   def evaluate(x: LabeledPoint, y: LabeledPoint): Double
 }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
index 03cc504bc34c3..1af34acd8668c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/KernelEstimator.scala
@@ -34,7 +34,7 @@ trait KernelEstimator extends Logging {
   /**
    * Calculate the AMISE (Asymptotic Mean Integrated Square Error)
    * optimal bandwidth assignment by 'solve the equation plug in method'
-   **/
+   * */
   def optimalBandwidth(data: RDD[Vector]): Unit
 }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
index a4a11dc53e2d1..4c26ca07c560e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/kernels/SVMKernel.scala
@@ -112,12 +112,12 @@ object SVMKernel extends Logging with Serializable {
 
     val result: DenseVector[Double] = DenseVector.tabulate(v.length)(
       (i) => {
-        //Get row number i of kernel
+        // Get row number i of kernel
        val row = DenseVector.apply(kernel
           .filter((point) => i == point._1._1)
           .map((p) => p._2)
           .collect())
-        //dot product with v
+        // dot product with v
         vbr.value.t * row
       }
     )
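The SVMKernel hunk above only retouches comments, but the surrounding code is the distributed kernel-matrix/vector product, built by filtering and collecting one row per output entry. A single-pass alternative sketch, assuming the kernel RDD has the RDD[((Long, Long), Double)] shape implied by point._1._1 and that v is already broadcast as vbr (as it is in the code above):

    // (K v)(i) = sum_j K(i, j) * v(j), accumulated with one reduceByKey
    // instead of one filter/collect per row.
    val products = kernel
      .map { case ((i, j), k) => (i, k * vbr.value(j.toInt)) }
      .reduceByKey(_ + _)
      .collectAsMap()
    val result = DenseVector.tabulate(v.length)((i) => products.getOrElse(i.toLong, 0.0))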
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
index 34d94544c26a4..3a0245a3c2853 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/prototype/EntropySelector.scala
@@ -64,12 +64,12 @@ class GreedyEntropySelector(
     val r = scala.util.Random
     var it: Int = 0
 
-    //All the elements not in the working set
+    // All the elements not in the working set
     var newDataset: RDD[Long] = data.keys.filter((p) => !workingset.contains(p))
 
-    //Existing best value of the entropy
+    // Existing best value of the entropy
     var oldEntropy: Double = this.measure.evaluate(data.filter((point) =>
       workingset.contains(point._1)))
 
-    //Store the value of entropy after an element swap
+    // Store the value of entropy after an element swap
     var newEntropy: Double = 0.0
     var d: Double = Double.NegativeInfinity
     var rand: Int = 0
@@ -85,9 +85,9 @@ class GreedyEntropySelector(
 
       val point2 = newDataset.takeSample(false, 1).apply(0)
 
-      //Update the working set
+      // Update the working set
       workingset(rand) = point2
-      //Calculate the new entropy
+      // Calculate the new entropy
       newEntropy = this.measure.evaluate(data.filter((p) =>
         workingset.contains(p._1)))
 
@@ -123,7 +123,7 @@
     } while(math.abs(d) >= this.delta && it <= this.MAX_ITERATIONS)
     logInfo("Working set obtained, now starting process of packaging it as an RDD")
 
-    //Time to return the final working set
+    // Time to return the final working set
     data.filter((p) => workingset.contains(p._1))
   }
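Taken together, the prototype-selection pieces are exercised in KernelSuite roughly as follows; every identifier below comes from the patches above and the sizes are the ones used in the tests:

    // Fit the AMISE bandwidth, build the entropy measure, then greedily
    // pick a maximum-entropy subset of prototypes for the Nystrom step.
    val kern = new GaussianDensityKernel()
    kern.optimalBandwidth(testRDD.map(_.features))
    val entropy = new QuadraticRenyiEntropy(kern)
    val selector = new GreedyEntropySelector(entropy)
    val prototypes = selector.selectPrototypes(SVMKernel.indexedRDD(testRDD), subsetSize)
    // prototypes: (index, LabeledPoint) pairs, subsetSize of them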