From 1e9d1bca4fa0047139092d1a0d042599f2b177e0 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 13 Aug 2015 22:42:34 +0800 Subject: [PATCH 1/7] PySpark DenseVector, SparseVector __eq__ should use semantics --- python/pyspark/mllib/linalg/__init__.py | 71 +++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 334dc8e38bb8f..a693f4883f460 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -404,7 +404,26 @@ def __repr__(self): return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) def __eq__(self, other): - return isinstance(other, DenseVector) and np.array_equal(self.array, other.array) + """ + Test DenseVector for equality. + + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> v1 == v2 + True + >>> v1 != v2 + False + >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v1 == v3 + True + """ + if isinstance(other, DenseVector): + return np.array_equal(self.array, other.array) + elif isinstance(other, SparseVector): + if len(self) != other.size: + return false + return Vectors.equals(list(xrange(len(self))), self.array, other.indices, other.values) + return NotImplemented def __ne__(self, other): return not self == other @@ -713,11 +732,18 @@ def __eq__(self, other): True >>> v1 != v2 False + >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v1 == v3 + True """ - return (isinstance(other, self.__class__) - and other.size == self.size - and np.array_equal(other.indices, self.indices) - and np.array_equal(other.values, self.values)) + if isinstance(other, SparseVector): + return other.size == self.size and np.array_equal(other.indices, self.indices) \ + and np.array_equal(other.values, self.values) + elif isinstance(other, DenseVector): + if self.size != len(other): + return false + return Vectors.equals(self.indices, self.values, list(xrange(len(other))), other.array) + return NotImplemented def __getitem__(self, index): inds = self.indices @@ -841,6 +867,41 @@ def parse(s): def zeros(size): return DenseVector(np.zeros(size)) + @staticmethod + def equals(v1_indices, v1_values, v2_indices, v2_values): + """ + Check equality between sparse/dense vectors + + >>> indices = [1, 2, 4] + >>> values = [1., 3., 2.] 
+ >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]) + True + >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]) + False + >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 0., 2.]) + False + >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]) + False + """ + v1_size = len(v1_values) + v2_size = len(v2_values) + k1 = 0 + k2 = 0 + all_equal = True + while all_equal: + while k1 < v1_size and v1_values[k1] == 0: + k1 += 1 + while k2 < v2_size and v2_values[k2] == 0: + k2 += 1 + + if k1 >= v1_size or k2 >= v2_size: + return k1 >= v1_size and k2 >= v2_size + + all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] + k1 += 1 + k2 += 1 + return all_equal + class Matrix(object): From 7489a440d0441a87cc5aa4618788d7918f03486a Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 15 Aug 2015 16:46:21 +0800 Subject: [PATCH 2/7] PySpark DenseVector, SparseVector implement __hash__ --- python/pyspark/mllib/linalg/__init__.py | 63 +++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index a693f4883f460..2ce1525572a35 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -25,6 +25,7 @@ import sys import array +import struct if sys.version >= '3': basestring = str @@ -122,6 +123,15 @@ def _format_float_list(l): return [_format_float(x) for x in l] +def _double_to_long_bits(value): + if value != value: + # value is NaN, standardize to canonical non-signaling NaN + return 0x7ff8000000000000 + else: + # pack double into 64 bits, then unpack as long int + return struct.unpack('Q', struct.pack('d', value))[0] + + class VectorUDT(UserDefinedType): """ SQL user-defined type (UDT) for Vector. 
@@ -428,6 +438,33 @@ def __eq__(self, other): def __ne__(self, other): return not self == other + def __hash__(self): + """ + Compute hashcode + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + True + >>> v2 = DenseVector([0.0, 1.0, 0.0, 5.5]) + >>> hash(v1) == hash(v2) + True + >>> v2 = DenseVector([1.0, 1.0, 0.0, 5.5]) + >>> hash(v1) == hash(v2) + False + """ + size = len(self) + result = 31 + size + count = 0 + i = 0 + while i < size and count < 16: + if self.array[i] != 0: + bits = _double_to_long_bits(self.array[i] + i) + result = 31 * result + (bits ^ (bits >> 32)) + + count += 1 + i += 1 + return result + def __getattr__(self, item): return getattr(self.array, item) @@ -765,6 +802,32 @@ def __getitem__(self, index): def __ne__(self, other): return not self.__eq__(other) + def __hash__(self): + """ + Compute hashcode + >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + True + >>> v2 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + >>> hash(v1) == hash(v2) + False + >>> v2 = SparseVector(4, [(2, 1.0), (3, 5.5)]) + >>> hash(v1) == hash(v2) + False + """ + result = 31 + self.size + count = 0 + i = 0 + while i < len(self.values) and count < 16: + if self.values[i] != 0: + bits = _double_to_long_bits(self.values[i] + self.indices[i]) + result = 31 * result + (bits ^ (bits >> 32)) + + count += 1 + i += 1 + return result + class Vectors(object): From 83f51edffadc0ff8f8daea90af71e41158801520 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 27 Aug 2015 11:17:35 +0800 Subject: [PATCH 3/7] document the indices must be strictly increasing --- python/pyspark/mllib/linalg/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 2ce1525572a35..c0a603cb5d5c0 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -933,7 +933,8 @@ def zeros(size): @staticmethod def equals(v1_indices, v1_values, v2_indices, v2_values): """ - Check equality between sparse/dense vectors + Check equality between sparse/dense vectors, + v1_indices and v2_indices assume to be strictly increasing. >>> indices = [1, 2, 4] >>> values = [1., 3., 2.] 
From fca0f5ab7e410df1717725e1f6e76e7e43d78baf Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 17:00:42 +0800 Subject: [PATCH 4/7] use the first 128 nonzeros entries to compute hash for PySpark Vector --- python/pyspark/mllib/linalg/__init__.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index c0a603cb5d5c0..a8f00044275d5 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -441,6 +441,7 @@ def __ne__(self, other): def __hash__(self): """ Compute hashcode + >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> hash(v1) == hash(v2) @@ -454,14 +455,14 @@ def __hash__(self): """ size = len(self) result = 31 + size - count = 0 + nnz = 0 i = 0 - while i < size and count < 16: + while i < size and nnz < 128: if self.array[i] != 0: - bits = _double_to_long_bits(self.array[i] + i) + result = 31 * result + i + bits = _double_to_long_bits(self.array[i]) result = 31 * result + (bits ^ (bits >> 32)) - - count += 1 + nnz += 1 i += 1 return result @@ -805,6 +806,7 @@ def __ne__(self, other): def __hash__(self): """ Compute hashcode + >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) >>> hash(v1) == hash(v2) @@ -817,14 +819,14 @@ def __hash__(self): False """ result = 31 + self.size - count = 0 + nnz = 0 i = 0 - while i < len(self.values) and count < 16: + while i < len(self.values) and nnz < 128: if self.values[i] != 0: - bits = _double_to_long_bits(self.values[i] + self.indices[i]) + result = 31 * result + int(self.indices[i]) + bits = _double_to_long_bits(self.values[i]) result = 31 * result + (bits ^ (bits >> 32)) - - count += 1 + nnz += 1 i += 1 return result From d3f8c14242506f91160a8c345291b8a1ce891d12 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 17:48:45 +0800 Subject: [PATCH 5/7] move the test to tests.py --- python/pyspark/mllib/linalg/__init__.py | 54 ------------------------- python/pyspark/mllib/tests.py | 23 +++++++++++ 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index a8f00044275d5..d36084641da20 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -414,19 +414,6 @@ def __repr__(self): return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) def __eq__(self, other): - """ - Test DenseVector for equality. 
- - >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v1 == v2 - True - >>> v1 != v2 - False - >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v1 == v3 - True - """ if isinstance(other, DenseVector): return np.array_equal(self.array, other.array) elif isinstance(other, SparseVector): @@ -439,20 +426,6 @@ def __ne__(self, other): return not self == other def __hash__(self): - """ - Compute hashcode - - >>> v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - True - >>> v2 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> hash(v1) == hash(v2) - True - >>> v2 = DenseVector([1.0, 1.0, 0.0, 5.5]) - >>> hash(v1) == hash(v2) - False - """ size = len(self) result = 31 + size nnz = 0 @@ -761,19 +734,6 @@ def __repr__(self): return "SparseVector({0}, {{{1}}})".format(self.size, entries) def __eq__(self, other): - """ - Test SparseVectors for equality. - - >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v1 == v2 - True - >>> v1 != v2 - False - >>> v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - >>> v1 == v3 - True - """ if isinstance(other, SparseVector): return other.size == self.size and np.array_equal(other.indices, self.indices) \ and np.array_equal(other.values, self.values) @@ -804,20 +764,6 @@ def __ne__(self, other): return not self.__eq__(other) def __hash__(self): - """ - Compute hashcode - - >>> v1 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - True - >>> v2 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - >>> hash(v1) == hash(v2) - False - >>> v2 = SparseVector(4, [(2, 1.0), (3, 5.5)]) - >>> hash(v1) == hash(v2) - False - """ result = 31 + self.size nnz = 0 i = 0 diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 5097c5e8ba4cd..14669880baf34 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -194,6 +194,29 @@ def test_squared_distance(self): self.assertEquals(3.0, _squared_distance(sv, arr)) self.assertEquals(3.0, _squared_distance(sv, narr)) + def test_hash(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([1.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertTrue(hash(v1) == hash(v2)) + self.assertFalse(hash(v1) == hash(v3)) + self.assertFalse(hash(v2) == hash(v3)) + self.assertFalse(hash(v2) == hash(v4)) + + def test_eq(self): + v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) + v4 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) + v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) + self.assertTrue(v1 == v2) + self.assertTrue(v1 == v3) + self.assertTrue(v2 == v4) + self.assertFalse(v1 == v5) + self.assertFalse(v1 == v6) + def test_conversion(self): # numpy arrays should be automatically upcast to float64 # tests for fix of [SPARK-5089] From 3b8ac7a82c25a6dfa00465df74e02238f311c861 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 14 Sep 2015 18:07:41 +0800 Subject: [PATCH 6/7] equals only internal used, so rename to _equals --- python/pyspark/mllib/linalg/__init__.py | 25 +++++++------------------ python/pyspark/mllib/tests.py | 12 ++++++++++-- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index d36084641da20..e0e6f4a481e1c 
100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -418,9 +418,9 @@ def __eq__(self, other): return np.array_equal(self.array, other.array) elif isinstance(other, SparseVector): if len(self) != other.size: - return false - return Vectors.equals(list(xrange(len(self))), self.array, other.indices, other.values) - return NotImplemented + return False + return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) + return False def __ne__(self, other): return not self == other @@ -739,9 +739,9 @@ def __eq__(self, other): and np.array_equal(other.values, self.values) elif isinstance(other, DenseVector): if self.size != len(other): - return false - return Vectors.equals(self.indices, self.values, list(xrange(len(other))), other.array) - return NotImplemented + return False + return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) + return False def __getitem__(self, index): inds = self.indices @@ -879,21 +879,10 @@ def zeros(size): return DenseVector(np.zeros(size)) @staticmethod - def equals(v1_indices, v1_values, v2_indices, v2_values): + def _equals(v1_indices, v1_values, v2_indices, v2_values): """ Check equality between sparse/dense vectors, v1_indices and v2_indices assume to be strictly increasing. - - >>> indices = [1, 2, 4] - >>> values = [1., 3., 2.] - >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.]) - True - >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.]) - False - >>> Vectors.equals(indices, values, list(range(5)), [0., 3., 0., 2.]) - False - >>> Vectors.equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.]) - False """ v1_size = len(v1_values) v2_size = len(v2_values) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 14669880baf34..00aa3bbffd1b2 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -208,15 +208,23 @@ def test_eq(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) - v4 = SparseVector(4, [(1, 1.0), (3, 5.5)]) + v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) self.assertTrue(v1 == v2) self.assertTrue(v1 == v3) - self.assertTrue(v2 == v4) + self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6) + def test_equals(self): + indices = [1, 2, 4] + values = [1., 3., 2.] 
+ self.assertTrue(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 1., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 3., 0., 2.])) + self.assertFalse(Vectors._equals(indices, values, list(range(5)), [0., 1., 3., 2., 2.])) + def test_conversion(self): # numpy arrays should be automatically upcast to float64 # tests for fix of [SPARK-5089] From b58d1bbe191fb5c42a2d810233cce8ad658f985b Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 15 Sep 2015 10:49:07 +0800 Subject: [PATCH 7/7] make _double_to_long_bits more readable & use assertEqual to test equality --- python/pyspark/mllib/linalg/__init__.py | 10 ++++------ python/pyspark/mllib/tests.py | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index e0e6f4a481e1c..380f86e9b44f8 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -124,12 +124,10 @@ def _format_float_list(l): def _double_to_long_bits(value): - if value != value: - # value is NaN, standardize to canonical non-signaling NaN - return 0x7ff8000000000000 - else: - # pack double into 64 bits, then unpack as long int - return struct.unpack('Q', struct.pack('d', value))[0] + if np.isnan(value): + value = float('nan') + # pack double into 64 bits, then unpack as long int + return struct.unpack('Q', struct.pack('d', value))[0] class VectorUDT(UserDefinedType): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 00aa3bbffd1b2..636f9a06cab7b 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -197,11 +197,12 @@ def test_squared_distance(self): def test_hash(self): v1 = DenseVector([0.0, 1.0, 0.0, 5.5]) v2 = SparseVector(4, [(1, 1.0), (3, 5.5)]) - v3 = DenseVector([1.0, 1.0, 0.0, 5.5]) + v3 = DenseVector([0.0, 1.0, 0.0, 5.5]) v4 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertTrue(hash(v1) == hash(v2)) - self.assertFalse(hash(v1) == hash(v3)) - self.assertFalse(hash(v2) == hash(v3)) + self.assertEquals(hash(v1), hash(v2)) + self.assertEquals(hash(v1), hash(v3)) + self.assertEquals(hash(v2), hash(v3)) + self.assertFalse(hash(v1) == hash(v4)) self.assertFalse(hash(v2) == hash(v4)) def test_eq(self): @@ -211,8 +212,8 @@ def test_eq(self): v4 = SparseVector(6, [(1, 1.0), (3, 5.5)]) v5 = DenseVector([0.0, 1.0, 0.0, 2.5]) v6 = SparseVector(4, [(1, 1.0), (3, 2.5)]) - self.assertTrue(v1 == v2) - self.assertTrue(v1 == v3) + self.assertEquals(v1, v2) + self.assertEquals(v1, v3) self.assertFalse(v2 == v4) self.assertFalse(v1 == v5) self.assertFalse(v1 == v6)
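
For reference, a minimal usage sketch of the equality and hashing semantics this series introduces. It is not part of the patch files themselves and assumes a PySpark build that already includes these changes:

    from pyspark.mllib.linalg import DenseVector, SparseVector

    dense = DenseVector([0.0, 1.0, 0.0, 5.5])
    sparse = SparseVector(4, [(1, 1.0), (3, 5.5)])

    # Cross-type equality: a dense and a sparse vector compare equal when they
    # hold the same values at the same positions.
    assert dense == sparse

    # __hash__ stays consistent with __eq__, so mixed representations can be
    # used interchangeably as dict keys or set members; only the first 128
    # nonzero entries contribute to the hash.
    assert hash(dense) == hash(sparse)
    assert len({dense, sparse}) == 1

    # Vectors that differ in any value, or in size, compare unequal.
    assert dense != SparseVector(4, [(1, 1.0), (3, 2.5)])
    assert dense != SparseVector(6, [(1, 1.0), (3, 5.5)])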
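
A second, standalone sketch (not taken verbatim from the patch) of why _double_to_long_bits canonicalizes NaN before hashing; the hex constants are the standard IEEE 754 bit patterns and the helper name here is only illustrative:

    import struct

    def double_to_long_bits(value):
        # Mirror java.lang.Double.doubleToLongBits: every NaN collapses to the
        # canonical quiet-NaN pattern so semantically equal values hash alike.
        if value != value:  # NaN is the only float that is not equal to itself
            return 0x7ff8000000000000
        # Reinterpret the 8 bytes of an IEEE 754 double as an unsigned 64-bit int.
        return struct.unpack('Q', struct.pack('d', value))[0]

    assert double_to_long_bits(1.0) == 0x3ff0000000000000
    assert double_to_long_bits(0.0) == 0
    assert double_to_long_bits(-0.0) == 0x8000000000000000
    assert double_to_long_bits(float('nan')) == 0x7ff8000000000000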