From 13669dbbf163bd1940d380388f94ad60237b10c0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 9 Dec 2014 18:13:51 +0800 Subject: [PATCH 01/10] Replace breezeSquaredDistance. --- .../org/apache/spark/mllib/util/MLUtils.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index b0d05ae33e1b5..14ac9c4157e7d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -19,8 +19,7 @@ package org.apache.spark.mllib.util import scala.reflect.ClassTag -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, - squaredDistance => breezeSquaredDistance} +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} import org.apache.spark.annotation.Experimental import org.apache.spark.SparkContext @@ -264,6 +263,16 @@ object MLUtils { } Vectors.fromBreeze(vector1) } + + /** + * Returns the squared distance between two Vectors. + */ + def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { + v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + val score = elems._1 - elems._2 + distance + score * score + }) + } /** * Returns the squared Euclidean distance between two vectors. The following formula will be used @@ -314,12 +323,10 @@ object MLUtils { val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { - // TODO: breezeSquaredDistance is slow, - // so we should replace it with our own implementation. - sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = vectorSquaredDistance(v1, v2) } } else { - sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = vectorSquaredDistance(v1, v2) } sqDist } From dd415bc448b9c89435a16a5c38c7b0e48a5de638 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 10 Dec 2014 14:44:40 +0800 Subject: [PATCH 02/10] Consider different cases of SparseVector and DenseVector. --- .../org/apache/spark/mllib/util/MLUtils.scala | 52 +++++++++++++++++-- .../spark/mllib/util/MLUtilsSuite.scala | 6 +++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 14ac9c4157e7d..90b3fe5a207d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PartitionwiseSampledRDD import org.apache.spark.util.random.BernoulliCellSampler import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} +import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext @@ -268,12 +268,56 @@ object MLUtils { * Returns the squared distance between two Vectors. 
*/ def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { - v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { - val score = elems._1 - elems._2 - distance + score * score + var squaredDistance = 0.0 + (v1, v2) match { + case (v1: SparseVector, v2: SparseVector) => + v1.indices.intersect(v2.indices).foreach((idx) => { + val score = v1(idx) - v2(idx) + squaredDistance += score * score + }) + + v1.indices.diff(v2.indices).foreach((idx) => { + val score = v1(idx) + squaredDistance += score * score + }) + + v2.indices.diff(v1.indices).foreach((idx) => { + val score = v2(idx) + squaredDistance += score * score + }) + + case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + squaredDistance = vectorSquaredDistance(v1, v2) + + case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + squaredDistance = vectorSquaredDistance(v2, v1) + + case (v1, v2) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + val score = elems._1 - elems._2 + distance + score * score + }) + } + squaredDistance + } + + /** + * Returns the squared distance between DenseVector and SparseVector. + */ + def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { + var squaredDistance = 0.0 + v1.indices.foreach((idx) => { + val score = v1(idx) - v2(idx) + squaredDistance += score * score + }) + (0 to v2.size - 1).toArray.diff(v1.indices).foreach((idx) => { + val score = v2(idx) + squaredDistance += score * score }) + squaredDistance } + /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error: diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index df07987093fbf..304a3e8b7a59d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -52,12 +52,18 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val values = indices.map(i => a(i)) val v2 = Vectors.sparse(n, indices, values) val norm2 = Vectors.norm(v2, 2.0) + val v3 = Vectors.sparse(n, indices, indices.map(i => a(i) + 0.5)) + val norm3 = Vectors.norm(v3, 2.0) val squaredDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) val fastSquaredDist1 = fastSquaredDistance(v1, norm1, v2, norm2, precision) assert((fastSquaredDist1 - squaredDist) <= precision * squaredDist, s"failed with m = $m") val fastSquaredDist2 = fastSquaredDistance(v1, norm1, Vectors.dense(v2.toArray), norm2, precision) assert((fastSquaredDist2 - squaredDist) <= precision * squaredDist, s"failed with m = $m") + val squaredDist2 = breezeSquaredDistance(v2.toBreeze, v3.toBreeze) + val fastSquaredDist3 = + fastSquaredDistance(v2, norm2, v3, norm3, precision) + assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") } } From d3e06284e8b12f86dbff7d41651a45d50c4bf1b5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 11 Dec 2014 16:39:32 +0800 Subject: [PATCH 03/10] Make the methods private. 
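For reference, the element-wise fallback used in the patches above (when neither operand is handled as a sparse case) amounts to summing the squared differences over all coordinates. A minimal standalone sketch, using plain arrays and an illustrative name rather than the MLlib Vector types:

    // Sum of squared element-wise differences between two equal-length arrays.
    def sqdistDense(a: Array[Double], b: Array[Double]): Double = {
      require(a.length == b.length, "vectors must have the same dimension")
      var i = 0
      var dist = 0.0
      while (i < a.length) {
        val d = a(i) - b(i)
        dist += d * d
        i += 1
      }
      dist
    }

    sqdistDense(Array(1.0, 0.0, 3.0), Array(1.0, 2.0, 0.0))  // 13.0
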
--- .../src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 90b3fe5a207d7..04ac6121df3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -267,7 +267,7 @@ object MLUtils { /** * Returns the squared distance between two Vectors. */ - def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { + private[util] def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => @@ -304,7 +304,7 @@ object MLUtils { /** * Returns the squared distance between DenseVector and SparseVector. */ - def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { + private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var squaredDistance = 0.0 v1.indices.foreach((idx) => { val score = v1(idx) - v2(idx) From a36e09fd18db812edb65c54b0f09a12fa6839a66 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 11 Dec 2014 17:05:17 +0800 Subject: [PATCH 04/10] Use while-loop to replace foreach for better performance. --- .../org/apache/spark/mllib/util/MLUtils.scala | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 04ac6121df3c9..7b25ef6ecbc62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -271,20 +271,33 @@ object MLUtils { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => - v1.indices.intersect(v2.indices).foreach((idx) => { + var count = 0 + var indices = v1.indices.intersect(v2.indices) + + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) - v2(idx) squaredDistance += score * score - }) + count += 1 + } - v1.indices.diff(v2.indices).foreach((idx) => { + count = 0 + indices = v1.indices.diff(v2.indices) + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) squaredDistance += score * score - }) - - v2.indices.diff(v1.indices).foreach((idx) => { + count += 1 + } + + count = 0 + indices = v2.indices.diff(v1.indices) + while (count < indices.length) { + val idx = indices(count) val score = v2(idx) squaredDistance += score * score - }) + count += 1 + } case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) @@ -306,14 +319,24 @@ object MLUtils { */ private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var squaredDistance = 0.0 - v1.indices.foreach((idx) => { + var count = 0 + var indices = v1.indices + + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) - v2(idx) squaredDistance += score * score - }) - (0 to v2.size - 1).toArray.diff(v1.indices).foreach((idx) => { + count += 1 + } + + count = 0 + indices = (0 to v2.size - 1).toArray.diff(v1.indices) + while (count < indices.length) { + val idx = indices(count) val score = v2(idx) squaredDistance += score * score - }) + count += 1 + } squaredDistance } From f4f5ebb59b10ab09414690498bdeadceb94ca2e5 Mon Sep 17 00:00:00 2001 From: Liang-Chi 
Hsieh Date: Thu, 11 Dec 2014 20:46:00 +0800 Subject: [PATCH 05/10] Follow BLAS.dot pattern to replace intersect, diff with while-loop. --- .../org/apache/spark/mllib/util/MLUtils.scala | 77 +++++++++++-------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7b25ef6ecbc62..89fc88a1cb60b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -271,32 +271,34 @@ object MLUtils { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => - var count = 0 - var indices = v1.indices.intersect(v2.indices) - - while (count < indices.length) { - val idx = indices(count) - val score = v1(idx) - v2(idx) - squaredDistance += score * score - count += 1 - } - - count = 0 - indices = v1.indices.diff(v2.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v1(idx) - squaredDistance += score * score - count += 1 - } + val v1Values = v1.values + val v1Indices = v1.indices + val v2Values = v2.values + val v2Indices = v2.indices + val nnzv1 = v1Indices.size + val nnzv2 = v2Indices.size - count = 0 - indices = v2.indices.diff(v1.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v2(idx) - squaredDistance += score * score - count += 1 + var kv1 = 0 + var kv2 = 0 + var score = 0.0 + while (kv1 < nnzv1) { + val iv1 = v1Indices(kv1) + + if (kv2 >= nnzv2 || iv1 < v2Indices(kv2)) { + score = v1Values(kv1) + squaredDistance += score * score + } + while (kv2 < nnzv2 && v2Indices(kv2) < iv1) { + score = v2Values(kv2) + squaredDistance += score * score + kv2 += 1 + } + if (kv2 < nnzv2 && v2Indices(kv2) == iv1) { + score = v1Values(kv1) - v2Values(kv2) + squaredDistance += score * score + kv2 += 1 + } + kv1 += 1 } case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => @@ -321,21 +323,28 @@ object MLUtils { var squaredDistance = 0.0 var count = 0 var indices = v1.indices - + var score = 0.0 while (count < indices.length) { val idx = indices(count) - val score = v1(idx) - v2(idx) + score = v1(idx) - v2(idx) squaredDistance += score * score count += 1 } - count = 0 - indices = (0 to v2.size - 1).toArray.diff(v1.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v2(idx) - squaredDistance += score * score - count += 1 + var kv1 = 0 + var kv2 = 0 + var iv1 = indices(kv1) + val nnzv2 = v2.size + while (kv2 < nnzv2) { + if (kv2 < iv1 || kv2 > iv1) { + score = v2(kv2) + squaredDistance += score * score + } + if (kv2 == iv1 && kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } + kv2 += 1 } squaredDistance } From 35db395c34385f5ee88c914467956cda9a71cc7d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 16 Dec 2014 18:02:57 +0800 Subject: [PATCH 06/10] Fix bug and some modifications for comments. 
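For reference, the BLAS.dot-style scan introduced above is a single merge over two sorted index arrays: at each step the smaller index contributes its value alone, and a shared index contributes the difference of the two values. A minimal standalone sketch under that assumption (plain arrays and an illustrative name, not the MLlib types):

    // Squared distance between two sparse vectors given as sorted (indices, values) pairs.
    def sparseSqdist(idx1: Array[Int], val1: Array[Double],
                     idx2: Array[Int], val2: Array[Double]): Double = {
      var k1 = 0
      var k2 = 0
      var dist = 0.0
      while (k1 < idx1.length || k2 < idx2.length) {
        val d =
          if (k2 >= idx2.length || (k1 < idx1.length && idx1(k1) < idx2(k2))) {
            val v = val1(k1); k1 += 1; v                       // index only in v1
          } else if (k1 >= idx1.length || idx2(k2) < idx1(k1)) {
            val v = val2(k2); k2 += 1; v                       // index only in v2
          } else {
            val v = val1(k1) - val2(k2); k1 += 1; k2 += 1; v   // index in both
          }
        dist += d * d
      }
      dist
    }

Each non-zero index is visited once, so the scan runs in O(nnz(v1) + nnz(v2)) rather than materializing intersect/diff index sets as the earlier revision did.
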
--- .../org/apache/spark/mllib/util/MLUtils.scala | 50 ++++++++----------- .../spark/mllib/util/MLUtilsSuite.scala | 8 +++ 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 89fc88a1cb60b..84b87b5c78d28 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -280,27 +280,24 @@ object MLUtils { var kv1 = 0 var kv2 = 0 - var score = 0.0 - while (kv1 < nnzv1) { - val iv1 = v1Indices(kv1) + while (kv1 < nnzv1 || kv2 < nnzv2) { + var score = 0.0 - if (kv2 >= nnzv2 || iv1 < v2Indices(kv2)) { + if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { score = v1Values(kv1) - squaredDistance += score * score - } - while (kv2 < nnzv2 && v2Indices(kv2) < iv1) { + kv1 += 1 + } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) - squaredDistance += score * score kv2 += 1 - } - if (kv2 < nnzv2 && v2Indices(kv2) == iv1) { + } else if ((kv1 < nnzv1 && kv2 < nnzv2) && v1Indices(kv1) == v2Indices(kv2)) { score = v1Values(kv1) - v2Values(kv2) - squaredDistance += score * score + kv1 += 1 kv2 += 1 } - kv1 += 1 + squaredDistance += score * score } + // The following two cases are used to handle dense and approximately dense vectors case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) @@ -308,10 +305,10 @@ object MLUtils { squaredDistance = vectorSquaredDistance(v2, v1) case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){(distance, elems) => val score = elems._1 - elems._2 distance + score * score - }) + } } squaredDistance } @@ -320,29 +317,26 @@ object MLUtils { * Returns the squared distance between DenseVector and SparseVector. 
*/ private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { - var squaredDistance = 0.0 - var count = 0 - var indices = v1.indices - var score = 0.0 - while (count < indices.length) { - val idx = indices(count) - score = v1(idx) - v2(idx) - squaredDistance += score * score - count += 1 - } - var kv1 = 0 var kv2 = 0 + var indices = v1.indices + var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size + while (kv2 < nnzv2) { + var score = 0.0 if (kv2 < iv1 || kv2 > iv1) { score = v2(kv2) squaredDistance += score * score } - if (kv2 == iv1 && kv1 < indices.length - 1) { - kv1 += 1 - iv1 = indices(kv1) + if (kv2 == iv1 && kv1 < indices.length) { + score = v1.values(iv1) - v2(kv2) + squaredDistance += score * score + if (kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } } kv2 += 1 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 304a3e8b7a59d..640041b7bcf8d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -64,6 +64,14 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val fastSquaredDist3 = fastSquaredDistance(v2, norm2, v3, norm3, precision) assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") + if (m > 10) { + val v4 = Vectors.sparse(n, indices.slice(0, m - 10), indices.map(i => a(i) + 0.5).slice(0, m - 10)) + val norm4 = Vectors.norm(v4, 2.0) + val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze) + val fastSquaredDist = + fastSquaredDistance(v2, norm2, v4, norm4, precision) + assert((fastSquaredDist - squaredDist) <= precision * squaredDist, s"failed with m = $m") + } } } From 44a65adc6c6c7c07969dc777b0aa681abe769a9e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 17 Dec 2014 14:26:25 +0800 Subject: [PATCH 07/10] Modified for comments. 
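The SparseVector-vs-DenseVector helper reworked above can equivalently be written as one pass over the dense positions, advancing a single pointer into the sparse indices. A minimal sketch under that reading (plain arrays and an illustrative name, not the patch's exact code):

    // Squared distance between a sparse vector (sorted indices/values) and a dense array.
    def sparseDenseSqdist(idx: Array[Int], vals: Array[Double], dense: Array[Double]): Double = {
      var k = 0          // pointer into the sparse indices
      var i = 0
      var dist = 0.0
      while (i < dense.length) {
        val d =
          if (k < idx.length && idx(k) == i) { val v = vals(k) - dense(i); k += 1; v }
          else dense(i)
        dist += d * d
        i += 1
      }
      dist
    }
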
--- .../org/apache/spark/mllib/util/MLUtils.scala | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 84b87b5c78d28..cfc5151b7d102 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -289,7 +289,7 @@ object MLUtils { } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) kv2 += 1 - } else if ((kv1 < nnzv1 && kv2 < nnzv2) && v1Indices(kv1) == v2Indices(kv2)) { + } else if (v1Indices(kv1) == v2Indices(kv2)) { score = v1Values(kv1) - v2Values(kv2) kv1 += 1 kv2 += 1 @@ -297,15 +297,15 @@ object MLUtils { squaredDistance += score * score } - // The following two cases are used to handle dense and approximately dense vectors case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => squaredDistance = vectorSquaredDistance(v2, v1) + // When a SparseVector is approximately dense, we treat it as a DenseVector case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){(distance, elems) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => val score = elems._1 - elems._2 distance + score * score } @@ -319,25 +319,23 @@ object MLUtils { private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var kv1 = 0 var kv2 = 0 - var indices = v1.indices + val indices = v1.indices var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size while (kv2 < nnzv2) { var score = 0.0 - if (kv2 < iv1 || kv2 > iv1) { + if (kv2 != iv1) { score = v2(kv2) - squaredDistance += score * score - } - if (kv2 == iv1 && kv1 < indices.length) { + } else { score = v1.values(iv1) - v2(kv2) - squaredDistance += score * score if (kv1 < indices.length - 1) { kv1 += 1 iv1 = indices(kv1) } } + squaredDistance += score * score kv2 += 1 } squaredDistance From 91849d0b3fc474aee1049683a440d7b97118f978 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 18 Dec 2014 14:48:46 +0800 Subject: [PATCH 08/10] Modified for comment. 
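One observation on the density guards used above: in v1.indices.length / v1.size < 0.5 both operands are Int, so the division truncates to 0 whenever the number of non-zeros is smaller than the dimension, and the guard is then always satisfied. If a 50% non-zero threshold is the intent, a floating-point comparison would presumably be needed, e.g.:

    val nnz = 3
    val size = 4
    nnz / size < 0.5           // true  -- 3 / 4 is integer division, yielding 0
    nnz.toDouble / size < 0.5  // false -- 0.75, an actual density ratio
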
--- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- .../test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index cfc5151b7d102..4af3e4395cc47 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -289,7 +289,7 @@ object MLUtils { } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) kv2 += 1 - } else if (v1Indices(kv1) == v2Indices(kv2)) { + } else { score = v1Values(kv1) - v2Values(kv2) kv1 += 1 kv2 += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 640041b7bcf8d..7778847f8b72a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -65,7 +65,8 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { fastSquaredDistance(v2, norm2, v3, norm3, precision) assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") if (m > 10) { - val v4 = Vectors.sparse(n, indices.slice(0, m - 10), indices.map(i => a(i) + 0.5).slice(0, m - 10)) + val v4 = Vectors.sparse(n, indices.slice(0, m - 10), + indices.map(i => a(i) + 0.5).slice(0, m - 10)) val norm4 = Vectors.norm(v4, 2.0) val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze) val fastSquaredDist = From ba34422f940abf352e705e2d466690c6568a88d0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 23 Dec 2014 18:28:18 +0800 Subject: [PATCH 09/10] Fix bug. --- .../src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 4af3e4395cc47..81f51c883baf0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -323,13 +323,13 @@ object MLUtils { var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size - + while (kv2 < nnzv2) { var score = 0.0 if (kv2 != iv1) { score = v2(kv2) } else { - score = v1.values(iv1) - v2(kv2) + score = v1.values(kv1) - v2(kv2) if (kv1 < indices.length - 1) { kv1 += 1 iv1 = indices(kv1) From f28b275e153b3d093bf063c53efe1dea91084918 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 31 Dec 2014 18:28:03 +0800 Subject: [PATCH 10/10] Move the implementation to linalg.Vectors and rename as sqdist. --- .../apache/spark/mllib/linalg/Vectors.scala | 80 ++++++++++++++++++ .../org/apache/spark/mllib/util/MLUtils.scala | 82 +------------------ 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 01f3f90577142..6a782b079aac3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -312,6 +312,86 @@ object Vectors { math.pow(sum, 1.0 / p) } } + + /** + * Returns the squared distance between two Vectors. + * @param v1 first Vector. + * @param v2 second Vector. 
+ * @return squared distance between two Vectors. + */ + def sqdist(v1: Vector, v2: Vector): Double = { + var squaredDistance = 0.0 + (v1, v2) match { + case (v1: SparseVector, v2: SparseVector) => + val v1Values = v1.values + val v1Indices = v1.indices + val v2Values = v2.values + val v2Indices = v2.indices + val nnzv1 = v1Indices.size + val nnzv2 = v2Indices.size + + var kv1 = 0 + var kv2 = 0 + while (kv1 < nnzv1 || kv2 < nnzv2) { + var score = 0.0 + + if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { + score = v1Values(kv1) + kv1 += 1 + } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { + score = v2Values(kv2) + kv2 += 1 + } else { + score = v1Values(kv1) - v2Values(kv2) + kv1 += 1 + kv2 += 1 + } + squaredDistance += score * score + } + + case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + squaredDistance = sqdist(v1, v2) + + case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + squaredDistance = sqdist(v2, v1) + + // When a SparseVector is approximately dense, we treat it as a DenseVector + case (v1, v2) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => + val score = elems._1 - elems._2 + distance + score * score + } + } + squaredDistance + } + + /** + * Returns the squared distance between DenseVector and SparseVector. + */ + private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = { + var kv1 = 0 + var kv2 = 0 + val indices = v1.indices + var squaredDistance = 0.0 + var iv1 = indices(kv1) + val nnzv2 = v2.size + + while (kv2 < nnzv2) { + var score = 0.0 + if (kv2 != iv1) { + score = v2(kv2) + } else { + score = v1.values(kv1) - v2(kv2) + if (kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } + } + squaredDistance += score * score + kv2 += 1 + } + squaredDistance + } } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index f0b9e1eea7bb4..c7843464a7505 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -266,84 +266,6 @@ object MLUtils { Vectors.fromBreeze(vector1) } - /** - * Returns the squared distance between two Vectors. 
- */ - private[util] def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { - var squaredDistance = 0.0 - (v1, v2) match { - case (v1: SparseVector, v2: SparseVector) => - val v1Values = v1.values - val v1Indices = v1.indices - val v2Values = v2.values - val v2Indices = v2.indices - val nnzv1 = v1Indices.size - val nnzv2 = v2Indices.size - - var kv1 = 0 - var kv2 = 0 - while (kv1 < nnzv1 || kv2 < nnzv2) { - var score = 0.0 - - if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { - score = v1Values(kv1) - kv1 += 1 - } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { - score = v2Values(kv2) - kv2 += 1 - } else { - score = v1Values(kv1) - v2Values(kv2) - kv1 += 1 - kv2 += 1 - } - squaredDistance += score * score - } - - case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => - squaredDistance = vectorSquaredDistance(v1, v2) - - case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => - squaredDistance = vectorSquaredDistance(v2, v1) - - // When a SparseVector is approximately dense, we treat it as a DenseVector - case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => - val score = elems._1 - elems._2 - distance + score * score - } - } - squaredDistance - } - - /** - * Returns the squared distance between DenseVector and SparseVector. - */ - private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { - var kv1 = 0 - var kv2 = 0 - val indices = v1.indices - var squaredDistance = 0.0 - var iv1 = indices(kv1) - val nnzv2 = v2.size - - while (kv2 < nnzv2) { - var score = 0.0 - if (kv2 != iv1) { - score = v2(kv2) - } else { - score = v1.values(kv1) - v2(kv2) - if (kv1 < indices.length - 1) { - kv1 += 1 - iv1 = indices(kv1) - } - } - squaredDistance += score * score - kv2 += 1 - } - squaredDistance - } - - /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error: @@ -393,10 +315,10 @@ object MLUtils { val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { - sqDist = vectorSquaredDistance(v1, v2) + sqDist = Vectors.sqdist(v1, v2) } } else { - sqDist = vectorSquaredDistance(v1, v2) + sqDist = Vectors.sqdist(v1, v2) } sqDist }
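A minimal usage sketch of the Vectors.sqdist API that this series ends with; the example vectors and the expected value are illustrative only:

    import org.apache.spark.mllib.linalg.Vectors

    val sv = Vectors.sparse(4, Array(0, 2), Array(1.0, 3.0))   // [1.0, 0.0, 3.0, 0.0]
    val dv = Vectors.dense(1.0, 2.0, 0.0, 4.0)

    // sqdist dispatches on the argument types: sparse/sparse uses the merge scan,
    // sparse/dense the single-pointer helper, and anything else the element-wise fallback.
    val d = Vectors.sqdist(sv, dv)   // 0.0 + 4.0 + 9.0 + 16.0 = 29.0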