From 13669dbbf163bd1940d380388f94ad60237b10c0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 9 Dec 2014 18:13:51 +0800 Subject: [PATCH 01/10] Replace breezeSquaredDistance. --- .../org/apache/spark/mllib/util/MLUtils.scala | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index b0d05ae33e1b5..14ac9c4157e7d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -19,8 +19,7 @@ package org.apache.spark.mllib.util import scala.reflect.ClassTag -import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, - squaredDistance => breezeSquaredDistance} +import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} import org.apache.spark.annotation.Experimental import org.apache.spark.SparkContext @@ -264,6 +263,16 @@ object MLUtils { } Vectors.fromBreeze(vector1) } + + /** + * Returns the squared distance between two Vectors. + */ + def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { + v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + val score = elems._1 - elems._2 + distance + score * score + }) + } /** * Returns the squared Euclidean distance between two vectors. The following formula will be used @@ -314,12 +323,10 @@ object MLUtils { val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { - // TODO: breezeSquaredDistance is slow, - // so we should replace it with our own implementation. - sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = vectorSquaredDistance(v1, v2) } } else { - sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) + sqDist = vectorSquaredDistance(v1, v2) } sqDist } From dd415bc448b9c89435a16a5c38c7b0e48a5de638 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 10 Dec 2014 14:44:40 +0800 Subject: [PATCH 02/10] Consider different cases of SparseVector and DenseVector. --- .../org/apache/spark/mllib/util/MLUtils.scala | 52 +++++++++++++++++-- .../spark/mllib/util/MLUtilsSuite.scala | 6 +++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 14ac9c4157e7d..90b3fe5a207d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -27,7 +27,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PartitionwiseSampledRDD import org.apache.spark.util.random.BernoulliCellSampler import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} +import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS.dot import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext @@ -268,12 +268,56 @@ object MLUtils { * Returns the squared distance between two Vectors. 
*/ def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { - v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { - val score = elems._1 - elems._2 - distance + score * score + var squaredDistance = 0.0 + (v1, v2) match { + case (v1: SparseVector, v2: SparseVector) => + v1.indices.intersect(v2.indices).foreach((idx) => { + val score = v1(idx) - v2(idx) + squaredDistance += score * score + }) + + v1.indices.diff(v2.indices).foreach((idx) => { + val score = v1(idx) + squaredDistance += score * score + }) + + v2.indices.diff(v1.indices).foreach((idx) => { + val score = v2(idx) + squaredDistance += score * score + }) + + case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + squaredDistance = vectorSquaredDistance(v1, v2) + + case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + squaredDistance = vectorSquaredDistance(v2, v1) + + case (v1, v2) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + val score = elems._1 - elems._2 + distance + score * score + }) + } + squaredDistance + } + + /** + * Returns the squared distance between DenseVector and SparseVector. + */ + def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { + var squaredDistance = 0.0 + v1.indices.foreach((idx) => { + val score = v1(idx) - v2(idx) + squaredDistance += score * score + }) + (0 to v2.size - 1).toArray.diff(v1.indices).foreach((idx) => { + val score = v2(idx) + squaredDistance += score * score }) + squaredDistance } + /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error: diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index df07987093fbf..304a3e8b7a59d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -52,12 +52,18 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val values = indices.map(i => a(i)) val v2 = Vectors.sparse(n, indices, values) val norm2 = Vectors.norm(v2, 2.0) + val v3 = Vectors.sparse(n, indices, indices.map(i => a(i) + 0.5)) + val norm3 = Vectors.norm(v3, 2.0) val squaredDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze) val fastSquaredDist1 = fastSquaredDistance(v1, norm1, v2, norm2, precision) assert((fastSquaredDist1 - squaredDist) <= precision * squaredDist, s"failed with m = $m") val fastSquaredDist2 = fastSquaredDistance(v1, norm1, Vectors.dense(v2.toArray), norm2, precision) assert((fastSquaredDist2 - squaredDist) <= precision * squaredDist, s"failed with m = $m") + val squaredDist2 = breezeSquaredDistance(v2.toBreeze, v3.toBreeze) + val fastSquaredDist3 = + fastSquaredDistance(v2, norm2, v3, norm3, precision) + assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") } } From d3e06284e8b12f86dbff7d41651a45d50c4bf1b5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 11 Dec 2014 16:39:32 +0800 Subject: [PATCH 03/10] Make the methods private. 
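For reference, the element-wise fallback used in the patches above (when neither operand is handled as a sparse case) amounts to summing the squared differences over all coordinates. A minimal standalone sketch, using plain arrays and an illustrative name rather than the MLlib Vector types:

    // Sum of squared element-wise differences between two equal-length arrays.
    def sqdistDense(a: Array[Double], b: Array[Double]): Double = {
      require(a.length == b.length, "vectors must have the same dimension")
      var i = 0
      var dist = 0.0
      while (i < a.length) {
        val d = a(i) - b(i)
        dist += d * d
        i += 1
      }
      dist
    }

    sqdistDense(Array(1.0, 0.0, 3.0), Array(1.0, 2.0, 0.0))  // 13.0
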
--- .../src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 90b3fe5a207d7..04ac6121df3c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -267,7 +267,7 @@ object MLUtils { /** * Returns the squared distance between two Vectors. */ - def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { + private[util] def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => @@ -304,7 +304,7 @@ object MLUtils { /** * Returns the squared distance between DenseVector and SparseVector. */ - def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { + private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var squaredDistance = 0.0 v1.indices.foreach((idx) => { val score = v1(idx) - v2(idx) From a36e09fd18db812edb65c54b0f09a12fa6839a66 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 11 Dec 2014 17:05:17 +0800 Subject: [PATCH 04/10] Use while-loop to replace foreach for better performance. --- .../org/apache/spark/mllib/util/MLUtils.scala | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 04ac6121df3c9..7b25ef6ecbc62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -271,20 +271,33 @@ object MLUtils { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => - v1.indices.intersect(v2.indices).foreach((idx) => { + var count = 0 + var indices = v1.indices.intersect(v2.indices) + + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) - v2(idx) squaredDistance += score * score - }) + count += 1 + } - v1.indices.diff(v2.indices).foreach((idx) => { + count = 0 + indices = v1.indices.diff(v2.indices) + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) squaredDistance += score * score - }) - - v2.indices.diff(v1.indices).foreach((idx) => { + count += 1 + } + + count = 0 + indices = v2.indices.diff(v1.indices) + while (count < indices.length) { + val idx = indices(count) val score = v2(idx) squaredDistance += score * score - }) + count += 1 + } case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) @@ -306,14 +319,24 @@ object MLUtils { */ private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var squaredDistance = 0.0 - v1.indices.foreach((idx) => { + var count = 0 + var indices = v1.indices + + while (count < indices.length) { + val idx = indices(count) val score = v1(idx) - v2(idx) squaredDistance += score * score - }) - (0 to v2.size - 1).toArray.diff(v1.indices).foreach((idx) => { + count += 1 + } + + count = 0 + indices = (0 to v2.size - 1).toArray.diff(v1.indices) + while (count < indices.length) { + val idx = indices(count) val score = v2(idx) squaredDistance += score * score - }) + count += 1 + } squaredDistance } From f4f5ebb59b10ab09414690498bdeadceb94ca2e5 Mon Sep 17 00:00:00 2001 From: Liang-Chi 
Hsieh Date: Thu, 11 Dec 2014 20:46:00 +0800 Subject: [PATCH 05/10] Follow BLAS.dot pattern to replace intersect, diff with while-loop. --- .../org/apache/spark/mllib/util/MLUtils.scala | 77 +++++++++++-------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 7b25ef6ecbc62..89fc88a1cb60b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -271,32 +271,34 @@ object MLUtils { var squaredDistance = 0.0 (v1, v2) match { case (v1: SparseVector, v2: SparseVector) => - var count = 0 - var indices = v1.indices.intersect(v2.indices) - - while (count < indices.length) { - val idx = indices(count) - val score = v1(idx) - v2(idx) - squaredDistance += score * score - count += 1 - } - - count = 0 - indices = v1.indices.diff(v2.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v1(idx) - squaredDistance += score * score - count += 1 - } + val v1Values = v1.values + val v1Indices = v1.indices + val v2Values = v2.values + val v2Indices = v2.indices + val nnzv1 = v1Indices.size + val nnzv2 = v2Indices.size - count = 0 - indices = v2.indices.diff(v1.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v2(idx) - squaredDistance += score * score - count += 1 + var kv1 = 0 + var kv2 = 0 + var score = 0.0 + while (kv1 < nnzv1) { + val iv1 = v1Indices(kv1) + + if (kv2 >= nnzv2 || iv1 < v2Indices(kv2)) { + score = v1Values(kv1) + squaredDistance += score * score + } + while (kv2 < nnzv2 && v2Indices(kv2) < iv1) { + score = v2Values(kv2) + squaredDistance += score * score + kv2 += 1 + } + if (kv2 < nnzv2 && v2Indices(kv2) == iv1) { + score = v1Values(kv1) - v2Values(kv2) + squaredDistance += score * score + kv2 += 1 + } + kv1 += 1 } case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => @@ -321,21 +323,28 @@ object MLUtils { var squaredDistance = 0.0 var count = 0 var indices = v1.indices - + var score = 0.0 while (count < indices.length) { val idx = indices(count) - val score = v1(idx) - v2(idx) + score = v1(idx) - v2(idx) squaredDistance += score * score count += 1 } - count = 0 - indices = (0 to v2.size - 1).toArray.diff(v1.indices) - while (count < indices.length) { - val idx = indices(count) - val score = v2(idx) - squaredDistance += score * score - count += 1 + var kv1 = 0 + var kv2 = 0 + var iv1 = indices(kv1) + val nnzv2 = v2.size + while (kv2 < nnzv2) { + if (kv2 < iv1 || kv2 > iv1) { + score = v2(kv2) + squaredDistance += score * score + } + if (kv2 == iv1 && kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } + kv2 += 1 } squaredDistance } From 35db395c34385f5ee88c914467956cda9a71cc7d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 16 Dec 2014 18:02:57 +0800 Subject: [PATCH 06/10] Fix bug and some modifications for comments. 
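For reference, the BLAS.dot-style scan introduced above is a single merge over two sorted index arrays: at each step the smaller index contributes its value alone, and a shared index contributes the difference of the two values. A minimal standalone sketch under that assumption (plain arrays and an illustrative name, not the MLlib types):

    // Squared distance between two sparse vectors given as sorted (indices, values) pairs.
    def sparseSqdist(idx1: Array[Int], val1: Array[Double],
                     idx2: Array[Int], val2: Array[Double]): Double = {
      var k1 = 0
      var k2 = 0
      var dist = 0.0
      while (k1 < idx1.length || k2 < idx2.length) {
        val d =
          if (k2 >= idx2.length || (k1 < idx1.length && idx1(k1) < idx2(k2))) {
            val v = val1(k1); k1 += 1; v                       // index only in v1
          } else if (k1 >= idx1.length || idx2(k2) < idx1(k1)) {
            val v = val2(k2); k2 += 1; v                       // index only in v2
          } else {
            val v = val1(k1) - val2(k2); k1 += 1; k2 += 1; v   // index in both
          }
        dist += d * d
      }
      dist
    }

Each non-zero index is visited once, so the scan runs in O(nnz(v1) + nnz(v2)) rather than materializing intersect/diff index sets as the earlier revision did.
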
--- .../org/apache/spark/mllib/util/MLUtils.scala | 50 ++++++++----------- .../spark/mllib/util/MLUtilsSuite.scala | 8 +++ 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 89fc88a1cb60b..84b87b5c78d28 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -280,27 +280,24 @@ object MLUtils { var kv1 = 0 var kv2 = 0 - var score = 0.0 - while (kv1 < nnzv1) { - val iv1 = v1Indices(kv1) + while (kv1 < nnzv1 || kv2 < nnzv2) { + var score = 0.0 - if (kv2 >= nnzv2 || iv1 < v2Indices(kv2)) { + if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { score = v1Values(kv1) - squaredDistance += score * score - } - while (kv2 < nnzv2 && v2Indices(kv2) < iv1) { + kv1 += 1 + } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) - squaredDistance += score * score kv2 += 1 - } - if (kv2 < nnzv2 && v2Indices(kv2) == iv1) { + } else if ((kv1 < nnzv1 && kv2 < nnzv2) && v1Indices(kv1) == v2Indices(kv2)) { score = v1Values(kv1) - v2Values(kv2) - squaredDistance += score * score + kv1 += 1 kv2 += 1 } - kv1 += 1 + squaredDistance += score * score } + // The following two cases are used to handle dense and approximately dense vectors case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) @@ -308,10 +305,10 @@ object MLUtils { squaredDistance = vectorSquaredDistance(v2, v1) case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0)((distance, elems) => { + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){(distance, elems) => val score = elems._1 - elems._2 distance + score * score - }) + } } squaredDistance } @@ -320,29 +317,26 @@ object MLUtils { * Returns the squared distance between DenseVector and SparseVector. 
*/ private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { - var squaredDistance = 0.0 - var count = 0 - var indices = v1.indices - var score = 0.0 - while (count < indices.length) { - val idx = indices(count) - score = v1(idx) - v2(idx) - squaredDistance += score * score - count += 1 - } - var kv1 = 0 var kv2 = 0 + var indices = v1.indices + var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size + while (kv2 < nnzv2) { + var score = 0.0 if (kv2 < iv1 || kv2 > iv1) { score = v2(kv2) squaredDistance += score * score } - if (kv2 == iv1 && kv1 < indices.length - 1) { - kv1 += 1 - iv1 = indices(kv1) + if (kv2 == iv1 && kv1 < indices.length) { + score = v1.values(iv1) - v2(kv2) + squaredDistance += score * score + if (kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } } kv2 += 1 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 304a3e8b7a59d..640041b7bcf8d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -64,6 +64,14 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val fastSquaredDist3 = fastSquaredDistance(v2, norm2, v3, norm3, precision) assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") + if (m > 10) { + val v4 = Vectors.sparse(n, indices.slice(0, m - 10), indices.map(i => a(i) + 0.5).slice(0, m - 10)) + val norm4 = Vectors.norm(v4, 2.0) + val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze) + val fastSquaredDist = + fastSquaredDistance(v2, norm2, v4, norm4, precision) + assert((fastSquaredDist - squaredDist) <= precision * squaredDist, s"failed with m = $m") + } } } From 44a65adc6c6c7c07969dc777b0aa681abe769a9e Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 17 Dec 2014 14:26:25 +0800 Subject: [PATCH 07/10] Modified for comments. 
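The SparseVector-vs-DenseVector helper reworked above can equivalently be written as one pass over the dense positions, advancing a single pointer into the sparse indices. A minimal sketch under that reading (plain arrays and an illustrative name, not the patch's exact code):

    // Squared distance between a sparse vector (sorted indices/values) and a dense array.
    def sparseDenseSqdist(idx: Array[Int], vals: Array[Double], dense: Array[Double]): Double = {
      var k = 0          // pointer into the sparse indices
      var i = 0
      var dist = 0.0
      while (i < dense.length) {
        val d =
          if (k < idx.length && idx(k) == i) { val v = vals(k) - dense(i); k += 1; v }
          else dense(i)
        dist += d * d
        i += 1
      }
      dist
    }
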
--- .../org/apache/spark/mllib/util/MLUtils.scala | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 84b87b5c78d28..cfc5151b7d102 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -289,7 +289,7 @@ object MLUtils { } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) kv2 += 1 - } else if ((kv1 < nnzv1 && kv2 < nnzv2) && v1Indices(kv1) == v2Indices(kv2)) { + } else if (v1Indices(kv1) == v2Indices(kv2)) { score = v1Values(kv1) - v2Values(kv2) kv1 += 1 kv2 += 1 @@ -297,15 +297,15 @@ object MLUtils { squaredDistance += score * score } - // The following two cases are used to handle dense and approximately dense vectors case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => squaredDistance = vectorSquaredDistance(v1, v2) case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => squaredDistance = vectorSquaredDistance(v2, v1) + // When a SparseVector is approximately dense, we treat it as a DenseVector case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){(distance, elems) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => val score = elems._1 - elems._2 distance + score * score } @@ -319,25 +319,23 @@ object MLUtils { private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { var kv1 = 0 var kv2 = 0 - var indices = v1.indices + val indices = v1.indices var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size while (kv2 < nnzv2) { var score = 0.0 - if (kv2 < iv1 || kv2 > iv1) { + if (kv2 != iv1) { score = v2(kv2) - squaredDistance += score * score - } - if (kv2 == iv1 && kv1 < indices.length) { + } else { score = v1.values(iv1) - v2(kv2) - squaredDistance += score * score if (kv1 < indices.length - 1) { kv1 += 1 iv1 = indices(kv1) } } + squaredDistance += score * score kv2 += 1 } squaredDistance From 91849d0b3fc474aee1049683a440d7b97118f978 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 18 Dec 2014 14:48:46 +0800 Subject: [PATCH 08/10] Modified for comment. 
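One observation on the density guards used above: in v1.indices.length / v1.size < 0.5 both operands are Int, so the division truncates to 0 whenever the number of non-zeros is smaller than the dimension, and the guard is then always satisfied. If a 50% non-zero threshold is the intent, a floating-point comparison would presumably be needed, e.g.:

    val nnz = 3
    val size = 4
    nnz / size < 0.5           // true  -- 3 / 4 is integer division, yielding 0
    nnz.toDouble / size < 0.5  // false -- 0.75, an actual density ratio
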
--- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- .../test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index cfc5151b7d102..4af3e4395cc47 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -289,7 +289,7 @@ object MLUtils { } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { score = v2Values(kv2) kv2 += 1 - } else if (v1Indices(kv1) == v2Indices(kv2)) { + } else { score = v1Values(kv1) - v2Values(kv2) kv1 += 1 kv2 += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 640041b7bcf8d..7778847f8b72a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -65,7 +65,8 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { fastSquaredDistance(v2, norm2, v3, norm3, precision) assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m") if (m > 10) { - val v4 = Vectors.sparse(n, indices.slice(0, m - 10), indices.map(i => a(i) + 0.5).slice(0, m - 10)) + val v4 = Vectors.sparse(n, indices.slice(0, m - 10), + indices.map(i => a(i) + 0.5).slice(0, m - 10)) val norm4 = Vectors.norm(v4, 2.0) val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze) val fastSquaredDist = From ba34422f940abf352e705e2d466690c6568a88d0 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 23 Dec 2014 18:28:18 +0800 Subject: [PATCH 09/10] Fix bug. --- .../src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 4af3e4395cc47..81f51c883baf0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -323,13 +323,13 @@ object MLUtils { var squaredDistance = 0.0 var iv1 = indices(kv1) val nnzv2 = v2.size - + while (kv2 < nnzv2) { var score = 0.0 if (kv2 != iv1) { score = v2(kv2) } else { - score = v1.values(iv1) - v2(kv2) + score = v1.values(kv1) - v2(kv2) if (kv1 < indices.length - 1) { kv1 += 1 iv1 = indices(kv1) From f28b275e153b3d093bf063c53efe1dea91084918 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 31 Dec 2014 18:28:03 +0800 Subject: [PATCH 10/10] Move the implementation to linalg.Vectors and rename as sqdist. --- .../apache/spark/mllib/linalg/Vectors.scala | 80 ++++++++++++++++++ .../org/apache/spark/mllib/util/MLUtils.scala | 82 +------------------ 2 files changed, 82 insertions(+), 80 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index 01f3f90577142..6a782b079aac3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -312,6 +312,86 @@ object Vectors { math.pow(sum, 1.0 / p) } } + + /** + * Returns the squared distance between two Vectors. + * @param v1 first Vector. + * @param v2 second Vector. 
+ * @return squared distance between two Vectors. + */ + def sqdist(v1: Vector, v2: Vector): Double = { + var squaredDistance = 0.0 + (v1, v2) match { + case (v1: SparseVector, v2: SparseVector) => + val v1Values = v1.values + val v1Indices = v1.indices + val v2Values = v2.values + val v2Indices = v2.indices + val nnzv1 = v1Indices.size + val nnzv2 = v2Indices.size + + var kv1 = 0 + var kv2 = 0 + while (kv1 < nnzv1 || kv2 < nnzv2) { + var score = 0.0 + + if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { + score = v1Values(kv1) + kv1 += 1 + } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { + score = v2Values(kv2) + kv2 += 1 + } else { + score = v1Values(kv1) - v2Values(kv2) + kv1 += 1 + kv2 += 1 + } + squaredDistance += score * score + } + + case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => + squaredDistance = sqdist(v1, v2) + + case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => + squaredDistance = sqdist(v2, v1) + + // When a SparseVector is approximately dense, we treat it as a DenseVector + case (v1, v2) => + squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => + val score = elems._1 - elems._2 + distance + score * score + } + } + squaredDistance + } + + /** + * Returns the squared distance between DenseVector and SparseVector. + */ + private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = { + var kv1 = 0 + var kv2 = 0 + val indices = v1.indices + var squaredDistance = 0.0 + var iv1 = indices(kv1) + val nnzv2 = v2.size + + while (kv2 < nnzv2) { + var score = 0.0 + if (kv2 != iv1) { + score = v2(kv2) + } else { + score = v1.values(kv1) - v2(kv2) + if (kv1 < indices.length - 1) { + kv1 += 1 + iv1 = indices(kv1) + } + } + squaredDistance += score * score + kv2 += 1 + } + squaredDistance + } } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index f0b9e1eea7bb4..c7843464a7505 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -266,84 +266,6 @@ object MLUtils { Vectors.fromBreeze(vector1) } - /** - * Returns the squared distance between two Vectors. 
- */ - private[util] def vectorSquaredDistance(v1: Vector, v2: Vector): Double = { - var squaredDistance = 0.0 - (v1, v2) match { - case (v1: SparseVector, v2: SparseVector) => - val v1Values = v1.values - val v1Indices = v1.indices - val v2Values = v2.values - val v2Indices = v2.indices - val nnzv1 = v1Indices.size - val nnzv2 = v2Indices.size - - var kv1 = 0 - var kv2 = 0 - while (kv1 < nnzv1 || kv2 < nnzv2) { - var score = 0.0 - - if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { - score = v1Values(kv1) - kv1 += 1 - } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { - score = v2Values(kv2) - kv2 += 1 - } else { - score = v1Values(kv1) - v2Values(kv2) - kv1 += 1 - kv2 += 1 - } - squaredDistance += score * score - } - - case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 => - squaredDistance = vectorSquaredDistance(v1, v2) - - case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 => - squaredDistance = vectorSquaredDistance(v2, v1) - - // When a SparseVector is approximately dense, we treat it as a DenseVector - case (v1, v2) => - squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) => - val score = elems._1 - elems._2 - distance + score * score - } - } - squaredDistance - } - - /** - * Returns the squared distance between DenseVector and SparseVector. - */ - private[util] def vectorSquaredDistance(v1: SparseVector, v2: DenseVector): Double = { - var kv1 = 0 - var kv2 = 0 - val indices = v1.indices - var squaredDistance = 0.0 - var iv1 = indices(kv1) - val nnzv2 = v2.size - - while (kv2 < nnzv2) { - var score = 0.0 - if (kv2 != iv1) { - score = v2(kv2) - } else { - score = v1.values(kv1) - v2(kv2) - if (kv1 < indices.length - 1) { - kv1 += 1 - iv1 = indices(kv1) - } - } - squaredDistance += score * score - kv2 += 1 - } - squaredDistance - } - - /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error: @@ -393,10 +315,10 @@ object MLUtils { val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { - sqDist = vectorSquaredDistance(v1, v2) + sqDist = Vectors.sqdist(v1, v2) } } else { - sqDist = vectorSquaredDistance(v1, v2) + sqDist = Vectors.sqdist(v1, v2) } sqDist }
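A minimal usage sketch of the Vectors.sqdist API that this series ends with; the example vectors and the expected value are illustrative only:

    import org.apache.spark.mllib.linalg.Vectors

    val sv = Vectors.sparse(4, Array(0, 2), Array(1.0, 3.0))   // [1.0, 0.0, 3.0, 0.0]
    val dv = Vectors.dense(1.0, 2.0, 0.0, 4.0)

    // sqdist dispatches on the argument types: sparse/sparse uses the merge scan,
    // sparse/dense the single-pointer helper, and anything else the element-wise fallback.
    val d = Vectors.sqdist(sv, dv)   // 0.0 + 4.0 + 9.0 + 16.0 = 29.0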