From 1e9d1bca4fa0047139092d1a0d042599f2b177e0 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 13 Aug 2015 22:42:34 +0800 Subject: [PATCH 1/7] PySpark DenseVector, SparseVector __eq__ should use semantics --- python/pyspark/mllib/linalg/__init__.py | 71 +++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 334dc8e38bb8f..a693f4883f460 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -404,7 +404,26 @@ def __repr__(self): return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) def __eq__(self, other): - return isinstance(other, DenseVector) and np.array_equal(self.array, other.array) + """ + Test DenseVector for equality. + + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> v1 == v2 + True + >>> v1 != v2 + False + >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v1 == v3 + True + """ + if isinstance(other, DenseVector): + return np.array_equal(self.array, other.array) + elif isinstance(other, SparseVector): + if len(self) != other.size: + return false + return Vectors.equals(list(xrange(len(self))), self.array, other.indices, other.values) + return NotImplemented def __ne__(self, other): return not self == other @@ -713,11 +732,18 @@ def __eq__(self, other): True >>> v1 != v2 False + >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v1 == v3 + True """ - return (isinstance(other, self.__class__) - and other.size == self.size - and np.array_equal(other.indices, self.indices) - and np.array_equal(other.values, self.values)) + if isinstance(other, SparseVector): + return other.size == self.size and np.array_equal(other.indices, self.indices) \ + and np.array_equal(other.values, self.values) + elif isinstance(other, DenseVector): + if self.size != len(other): + return false + return Vectors.equals(self.indices, self.values, list(xrange(len(other))), other.array) + return NotImplemented def __getitem__(self, index): inds = self.indices @@ -841,6 +867,41 @@ def parse(s): def zeros(size): return DenseVector(np.zeros(size)) + @staticmethod + def equals(v1_indices, v1_values, v2_indices, v2_values): + """ + Check equality between sparse/dense vectors + + >>> indices = [1, 2, 4] + >>> values = [1., 3., 2.] 
+ >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]) + True + >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]) + False + >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 0., 2.]) + False + >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]) + False + """ + v1_size = len(v1_values) + v2_size = len(v2_values) + k1 = 0 + k2 = 0 + all_equal = True + while all_equal: + while k1 < v1_size and v1_values[k1] == 0: + k1 += 1 + while k2 < v2_size and v2_values[k2] == 0: + k2 += 1 + + if k1 >= v1_size or k2 >= v2_size: + return k1 >= v1_size and k2 >= v2_size + + all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] + k1 += 1 + k2 += 1 + return all_equal + class Matrix(object): From 7489a440d0441a87cc5aa4618788d7918f03486a Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 15 Aug 2015 16:46:21 +0800 Subject: [PATCH 2/7] PySpark DenseVector, SparseVector implement __hash__ --- python/pyspark/mllib/linalg/__init__.py | 63 +++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index a693f4883f460..2ce1525572a35 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -25,6 +25,7 @@ import sys import array +import struct if sys.version >= '3': basestring = str @@ -122,6 +123,15 @@ def _format_float_list(l): return [_format_float(x) for x in l] +def _double_to_long_bits(value): + if value != value: + # value is NaN, standardize to canonical non-signaling NaN + return 0x7ff8000000000000 + else: + # pack double into 64 bits, then unpack as long int + return struct.unpack('Q', struct.pack('d', value))[0] + + class VectorUDT(UserDefinedType): """ SQL user-defined type (UDT) for Vector. 
@@ -428,6 +438,33 @@ def __eq__(self, other): def __ne__(self, other): return not self == other + def __hash__(self): + """ + Compute hashcode + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + True + >>> v2 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> hash(v1) == hash(v2) + True + >>> v2 = DenseVector([1.0, 1.0, 0.0, 5.5]) + >>> hash(v1) == hash(v2) + False + """ + size = len(self) + result = 31 + size + count = 0 + i = 0 + while i < size and count < 16: + if self.array[i] != 0: + bits = _double_to_long_bits(self.array[i] + i) + result = 31 * result + (bits ^ (bits >> 32)) + + count += 1 + i += 1 + return result + def __getattr__(self, item): return getattr(self.array, item) @@ -765,6 +802,32 @@ def __getitem__(self, index): def __ne__(self, other): return not self.__eq__(other) + def __hash__(self): + """ + Compute hashcode + >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + True + >>> v2 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + >>> hash(v1) == hash(v2) + False + >>> v2 = SparseVector(4, [(2, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + False + """ + result = 31 + self.size + count = 0 + i = 0 + while i < len(self.values) and count < 16: + if self.values[i] != 0: + bits = _double_to_long_bits(self.values[i] + self.indices[i]) + result = 31 * result + (bits ^ (bits >> 32)) + + count += 1 + i += 1 + return result + class Vectors(object): From 83f51edffadc0ff8f8daea90af71e41158801520 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 27 Aug 2015 11:17:35 +0800 Subject: [PATCH 3/7] document the indices must be strictly increasing --- python/pyspark/mllib/linalg/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 2ce1525572a35..c0a603cb5d5c0 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -933,7 +933,8 @@ def zeros(size): @staticmethod def equals(v1_indices, v1_values, v2_indices, v2_values): """ - Check equality between sparse/dense vectors + Check equality between sparse/dense vectors, + v1_indices and v2_indices assume to be strictly increasing. >>> indices = [1, 2, 4] >>> values = [1., 3., 2.] 
From fca0f5ab7e410df1717725e1f6e76e7e43d78baf Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 17:00:42 +0800 Subject: [PATCH 4/7] use the first 128 nonzeros entries to compute hash for PySpark Vector --- python/pyspark/mllib/linalg/__init__.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index c0a603cb5d5c0..a8f00044275d5 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -441,6 +441,7 @@ def __ne__(self, other): def __hash__(self): """ Compute hashcode + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> hash(v1) == hash(v2) @@ -454,14 +455,14 @@ def __hash__(self): """ size = len(self) result = 31 + size - count = 0 + nnz = 0 i = 0 - while i < size and count < 16: + while i < size and nnz < 128: if self.array[i] != 0: - bits = _double_to_long_bits(self.array[i] + i) + result = 31 * result + i + bits = _double_to_long_bits(self.array[i]) result = 31 * result + (bits ^ (bits >> 32)) - - count += 1 + nnz += 1 i += 1 return result @@ -805,6 +806,7 @@ def __ne__(self, other): def __hash__(self): """ Compute hashcode + >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> hash(v1) == hash(v2) @@ -817,14 +819,14 @@ def __hash__(self): False """ result = 31 + self.size - count = 0 + nnz = 0 i = 0 - while i < len(self.values) and count < 16: + while i < len(self.values) and nnz < 128: if self.values[i] != 0: - bits = _double_to_long_bits(self.values[i] + self.indices[i]) + result = 31 * result + int(self.indices[i]) + bits = _double_to_long_bits(self.values[i]) result = 31 * result + (bits ^ (bits >> 32)) - - count += 1 + nnz += 1 i += 1 return result From d3f8c14242506f91160a8c345291b8a1ce891d12 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 17:48:45 +0800 Subject: [PATCH 5/7] move the test to tests.py --- python/pyspark/mllib/linalg/__init__.py | 54 ------------------------- python/pyspark/mllib/tests.py | 23 +++++++++++ 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index a8f00044275d5..d36084641da20 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -414,19 +414,6 @@ def __repr__(self): return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) def __eq__(self, other): - """ - Test DenseVector for equality. 
- - >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v1 == v2 - True - >>> v1 != v2 - False - >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v1 == v3 - True - """ if isinstance(other, DenseVector): return np.array_equal(self.array, other.array) elif isinstance(other, SparseVector): @@ -439,20 +426,6 @@ def __ne__(self, other): return not self == other def __hash__(self): - """ - Compute hashcode - - >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - True - >>> v2 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> hash(v1) == hash(v2) - True - >>> v2 = DenseVector([1.0, 1.0, 0.0, 5.5]) - >>> hash(v1) == hash(v2) - False - """ size = len(self) result = 31 + size nnz = 0 @@ -761,19 +734,6 @@ def __repr__(self): return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): - """ - Test SparseVectors for equality. - - >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v1 == v2 - True - >>> v1 != v2 - False - >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v1 == v3 - True - """ if isinstance(other, SparseVector): return other.size == self.size and np.array_equal(other.indices, self.indices) \ and np.array_equal(other.values, self.values) @@ -804,20 +764,6 @@ def __ne__(self, other): return not self.__eq__(other) def __hash__(self): - """ - Compute hashcode - - >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - True - >>> v2 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - >>> hash(v1) == hash(v2) - False - >>> v2 = SparseVector(4, [(2, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - False - """ result = 31 + self.size nnz = 0 i = 0 diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 5097c5e8ba4cd..14669880baf34 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -194,6 +194,29 @@ def test_squared_distance(self): self.assertEquals(3.0, _squared_distance(sv, arr)) self.assertEquals(3.0, _squared_distance(sv, narr)) + def test_hash(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([1.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertTrue(hash(v1) == hash(v2)) + self.assertFalse(hash(v1) == hash(v3)) + self.assertFalse(hash(v2) == hash(v3)) + self.assertFalse(hash(v2) == hash(v4)) + + def test_eq(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) + v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertTrue(v1 == v2) + self.assertTrue(v1 == v3) + self.assertTrue(v2 == v4) + self.assertFalse(v1 == v5) + self.assertFalse(v1 == v6) + def test_conversion(self): # numpy arrays should be automatically upcast to float64 # tests for fix of [SPARK-5089] From 3b8ac7a82c25a6dfa00465df74e02238f311c861 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 18:07:41 +0800 Subject: [PATCH 6/7] equals only internal used, so rename to _equals --- python/pyspark/mllib/linalg/__init__.py | 25 +++++++------------------ python/pyspark/mllib/tests.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index d36084641da20..e0e6f4a481e1c 
100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -418,9 +418,9 @@ def __eq__(self, other): return np.array_equal(self.array, other.array) elif isinstance(other, SparseVector): if len(self) != other.size: - return false - return Vectors.equals(list(xrange(len(self))), self.array, other.indices, other.values) - return NotImplemented + return False + return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return False def __ne__(self, other): return not self == other @@ -739,9 +739,9 @@ def __eq__(self, other): and np.array_equal(other.values, self.values) elif isinstance(other, DenseVector): if self.size != len(other): - return false - return Vectors.equals(self.indices, self.values, list(xrange(len(other))), other.array) - return NotImplemented + return False + return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return False def __getitem__(self, index): inds = self.indices @@ -879,21 +879,10 @@ def zeros(size): return DenseVector(np.zeros(size)) @staticmethod - def equals(v1_indices, v1_values, v2_indices, v2_values): + def _equals(v1_indices, v1_values, v2_indices, v2_values): """ Check equality between sparse/dense vectors, v1_indices and v2_indices assume to be strictly increasing. - - >>> indices = [1, 2, 4] - >>> values = [1., 3., 2.] - >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]) - True - >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]) - False - >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 0., 2.]) - False - >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]) - False """ v1_size = len(v1_values) v2_size = len(v2_values) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 14669880baf34..00aa3bbffd1b2 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -208,15 +208,23 @@ def test_eq(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) self.assertTrue(v1 == v2) self.assertTrue(v1 == v3) - self.assertTrue(v2 == v4) + self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6) + def test_equals(self): + indices = [1, 2, 4] + values = [1., 3., 2.] 
+ self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.])) + def test_conversion(self): # numpy arrays should be automatically upcast to float64 # tests for fix of [SPARK-5089] From b58d1bbe191fb5c42a2d810233cce8ad658f985b Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 15 Sep 2015 10:49:07 +0800 Subject: [PATCH 7/7] make _double_to_long_bits more readable & use assertEqual to test equality --- python/pyspark/mllib/linalg/__init__.py | 10 ++++------ python/pyspark/mllib/tests.py | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index e0e6f4a481e1c..380f86e9b44f8 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -124,12 +124,10 @@ def _format_float_list(l): def _double_to_long_bits(value): - if value != value: - # value is NaN, standardize to canonical non-signaling NaN - return 0x7ff8000000000000 - else: - # pack double into 64 bits, then unpack as long int - return struct.unpack('Q', struct.pack('d', value))[0] + if np.isnan(value): + value = float('nan') + # pack double into 64 bits, then unpack as long int + return struct.unpack('Q', struct.pack('d', value))[0] class VectorUDT(UserDefinedType): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 00aa3bbffd1b2..636f9a06cab7b 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -197,11 +197,12 @@ def test_squared_distance(self): def test_hash(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([1.0, 1.0, 0.0, 5.5]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertTrue(hash(v1) == hash(v2)) - self.assertFalse(hash(v1) == hash(v3)) - self.assertFalse(hash(v2) == hash(v3)) + self.assertEquals(hash(v1), hash(v2)) + self.assertEquals(hash(v1), hash(v3)) + self.assertEquals(hash(v2), hash(v3)) + self.assertFalse(hash(v1) == hash(v4)) self.assertFalse(hash(v2) == hash(v4)) def test_eq(self): @@ -211,8 +212,8 @@ def test_eq(self): v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertTrue(v1 == v2) - self.assertTrue(v1 == v3) + self.assertEquals(v1, v2) + self.assertEquals(v1, v3) self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6)
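
For reference, a minimal usage sketch of the equality and hashing semantics this series introduces. It is not part of the patch files themselves and assumes a PySpark build that already includes these changes:

    from pyspark.mllib.linalg import DenseVector, SparseVector

    dense = DenseVector([0.0, 1.0, 0.0, 5.5])
    sparse = SparseVector(4, [(1, 1.0), (3, 5.5)])

    # Cross-type equality: a dense and a sparse vector compare equal when they
    # hold the same values at the same positions.
    assert dense == sparse

    # __hash__ stays consistent with __eq__, so mixed representations can be
    # used interchangeably as dict keys or set members; only the first 128
    # nonzero entries contribute to the hash.
    assert hash(dense) == hash(sparse)
    assert len({dense, sparse}) == 1

    # Vectors that differ in any value, or in size, compare unequal.
    assert dense != SparseVector(4, [(1, 1.0), (3, 2.5)])
    assert dense != SparseVector(6, [(1, 1.0), (3, 5.5)])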
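
A second, standalone sketch (not taken verbatim from the patch) of why _double_to_long_bits canonicalizes NaN before hashing; the hex constants are the standard IEEE 754 bit patterns and the helper name here is only illustrative:

    import struct

    def double_to_long_bits(value):
        # Mirror java.lang.Double.doubleToLongBits: every NaN collapses to the
        # canonical quiet-NaN pattern so semantically equal values hash alike.
        if value != value:  # NaN is the only float that is not equal to itself
            return 0x7ff8000000000000
        # Reinterpret the 8 bytes of an IEEE 754 double as an unsigned 64-bit int.
        return struct.unpack('Q', struct.pack('d', value))[0]

    assert double_to_long_bits(1.0) == 0x3ff0000000000000
    assert double_to_long_bits(0.0) == 0
    assert double_to_long_bits(-0.0) == 0x8000000000000000
    assert double_to_long_bits(float('nan')) == 0x7ff8000000000000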