From 35974c9bd813c28e591197bcf4d069c70b834fe4 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 30 Jun 2016 15:47:56 +0200 Subject: [PATCH 1/3] Add 'asML' and 'fromML' conversion methods to PySpark linalg --- python/pyspark/mllib/linalg/__init__.py | 161 ++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 3a345b2b5638..e8df0fe3f588 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -39,6 +39,7 @@ import numpy as np from pyspark import since +from pyspark.ml import linalg as newlinalg from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ IntegerType, ByteType, BooleanType @@ -247,6 +248,15 @@ def toArray(self): """ raise NotImplementedError + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + :return: :py:class:`pyspark.ml.linalg.Vector` + """ + raise NotImplementedError + class DenseVector(Vector): """ @@ -408,6 +418,23 @@ def toArray(self): """ return self.array + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibDV = Vectors.dense([1, 2, 3]) + >>> mlDV1 = newlinalg.Vectors.dense([1, 2, 3]) + >>> mlDV2 = mllibDV.asML() + >>> mlDV2 == mlDV1 + True + + :return: :py:class:`pyspark.ml.linalg.DenseVector` + + .. versionadded:: 2.0.0 + """ + return newlinalg.DenseVector(self.array) + @property def values(self): """ @@ -737,6 +764,23 @@ def toArray(self): arr[self.indices] = self.values return arr + def asML(self): + """ + Convert this vector to the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + >>> mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + >>> mlSV2 = mllibSV.asML() + >>> mlSV2 == mlSV1 + True + + :return: :py:class:`pyspark.ml.linalg.SparseVector` + + .. versionadded:: 2.0.0 + """ + return newlinalg.SparseVector(self.size, self.indices, self.values) + def __len__(self): return self.size @@ -845,6 +889,33 @@ def dense(*elements): elements = elements[0] return DenseVector(elements) + @staticmethod + def fromML(vec): + """ + Convert a vector from the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibDV1 = Vectors.dense([1, 2, 3]) + >>> mlDV = newlinalg.Vectors.dense([1, 2, 3]) + >>> mllibDV2 = Vectors.fromML(mlDV) + >>> mllibDV1 == mllibDV2 + True + >>> mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + >>> mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + >>> mllibSV2 = Vectors.fromML(mlSV) + >>> mllibSV1 == mllibSV2 + True + + :param vec: a :py:class:`pyspark.ml.linalg.Vector` + :return: a :py:class:`pyspark.mllib.linalg.Vector` + """ + if type(vec) == newlinalg.DenseVector: + return DenseVector(vec.array) + elif type(vec) == newlinalg.SparseVector: + return SparseVector(vec.size, vec.indices, vec.values) + else: + raise TypeError("Unsupported vector type %s" % type(vec)) + @staticmethod def stringify(vector): """ @@ -945,6 +1016,13 @@ def toArray(self): """ raise NotImplementedError + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + """ + raise NotImplementedError + @staticmethod def _convert_to_array(array_like, dtype): """ @@ -1044,6 +1122,28 @@ def toSparse(self): return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) + >>> mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) + >>> mlDM2 = mllibDM.asML() + >>> mlDM2 == mlDM1 + True + >>> mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) + >>> mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) + >>> mlDMt2 = mllibDMt.asML() + >>> mlDMt2 == mlDMt1 + True + + :return: :py:class:`pyspark.ml.linalg.DenseMatrix` + + .. versionadded:: 2.0.0 + """ + return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) + def __getitem__(self, indices): i, j = indices if i < 0 or i >= self.numRows: @@ -1216,6 +1316,29 @@ def toDense(self): densevals = np.ravel(self.toArray(), order='F') return DenseMatrix(self.numRows, self.numCols, densevals) + def asML(self): + """ + Convert this matrix to the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + >>> mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + >>> mlSM2 = mllibSM.asML() + >>> mlSM2 == mlSM1 + True + >>> mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + >>> mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + >>> mlSMt2 = mllibSMt.asML() + >>> mlSMt2 == mlSMt1 + True + + :return: :py:class:`pyspark.ml.linalg.SparseMatrix` + + .. versionadded:: 2.0.0 + """ + return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, + self.values, self.isTransposed) + # TODO: More efficient implementation: def __eq__(self, other): return np.all(self.toArray() == other.toArray()) @@ -1236,6 +1359,44 @@ def sparse(numRows, numCols, colPtrs, rowIndices, values): """ return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) + @staticmethod + def fromML(mat): + """ + Convert a matrix from the new mllib-local representation. + This does NOT copy the data; it copies references. + + >>> mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) + >>> mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) + >>> mllibDM2 = Matrices.fromML(mlDM) + >>> mllibDM1 == mllibDM2 + True + >>> mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) + >>> mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) + >>> mllibDMt2 = Matrices.fromML(mlDMt) + >>> mllibDMt1 == mllibDMt2 + True + >>> mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + >>> mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + >>> mllibSM2 = Matrices.fromML(mlSM) + >>> mllibSM1 == mllibSM2 + True + >>> mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + >>> mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + >>> mllibSMt2 = Matrices.fromML(mlSMt) + >>> mllibSMt1 == mllibSMt2 + True + + :param vec: a :py:class:`pyspark.ml.linalg.Matrix` + :return: a :py:class:`pyspark.mllib.linalg.Matrix` + """ + if type(mat) == newlinalg.DenseMatrix: + return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) + elif type(mat) == newlinalg.SparseMatrix: + return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices, + mat.values, mat.isTransposed) + else: + raise TypeError("Unsupported matrix type %s" % type(mat)) + class QRDecomposition(object): """ From a1804213bd1cb9b26e2693826e44d548341b8942 Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 30 Jun 2016 20:31:40 +0200 Subject: [PATCH 2/3] use isinstance --- python/pyspark/mllib/linalg/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index e8df0fe3f588..10abc5373873 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -909,9 +909,9 @@ def fromML(vec): :param vec: a :py:class:`pyspark.ml.linalg.Vector` :return: a :py:class:`pyspark.mllib.linalg.Vector` """ - if type(vec) == newlinalg.DenseVector: + if isinstance(vec, newlinalg.DenseVector): return DenseVector(vec.array) - elif type(vec) == newlinalg.SparseVector: + elif isinstance(vec, newlinalg.SparseVector): return SparseVector(vec.size, vec.indices, vec.values) else: raise TypeError("Unsupported vector type %s" % type(vec)) @@ -1389,9 +1389,9 @@ def fromML(mat): :param vec: a :py:class:`pyspark.ml.linalg.Matrix` :return: a :py:class:`pyspark.mllib.linalg.Matrix` """ - if type(mat) == newlinalg.DenseMatrix: + if isinstance(mat, newlinalg.DenseMatrix): return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) - elif type(mat) == newlinalg.SparseMatrix: + elif isinstance(mat, newlinalg.SparseMatrix): return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices, mat.values, mat.isTransposed) else: From 05ff5274c3562d4c5994960b835af88c836d6c8a Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Thu, 30 Jun 2016 23:45:22 +0200 Subject: [PATCH 3/3] versionadded and move tests to tests.py --- python/pyspark/mllib/linalg/__init__.py | 72 ++----------------------- python/pyspark/mllib/tests.py | 69 ++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 67 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 10abc5373873..15dc53a959d6 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -423,12 +423,6 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibDV = Vectors.dense([1, 2, 3]) - >>> mlDV1 = newlinalg.Vectors.dense([1, 2, 3]) - >>> mlDV2 = mllibDV.asML() - >>> mlDV2 == mlDV1 - True - :return: :py:class:`pyspark.ml.linalg.DenseVector` .. versionadded:: 2.0.0 @@ -769,12 +763,6 @@ def asML(self): Convert this vector to the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5}) - >>> mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) - >>> mlSV2 = mllibSV.asML() - >>> mlSV2 == mlSV1 - True - :return: :py:class:`pyspark.ml.linalg.SparseVector` .. versionadded:: 2.0.0 @@ -895,19 +883,10 @@ def fromML(vec): Convert a vector from the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibDV1 = Vectors.dense([1, 2, 3]) - >>> mlDV = newlinalg.Vectors.dense([1, 2, 3]) - >>> mllibDV2 = Vectors.fromML(mlDV) - >>> mllibDV1 == mllibDV2 - True - >>> mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5}) - >>> mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) - >>> mllibSV2 = Vectors.fromML(mlSV) - >>> mllibSV1 == mllibSV2 - True - :param vec: a :py:class:`pyspark.ml.linalg.Vector` :return: a :py:class:`pyspark.mllib.linalg.Vector` + + .. versionadded:: 2.0.0 """ if isinstance(vec, newlinalg.DenseVector): return DenseVector(vec.array) @@ -1127,17 +1106,6 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) - >>> mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) - >>> mlDM2 = mllibDM.asML() - >>> mlDM2 == mlDM1 - True - >>> mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) - >>> mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) - >>> mlDMt2 = mllibDMt.asML() - >>> mlDMt2 == mlDMt1 - True - :return: :py:class:`pyspark.ml.linalg.DenseMatrix` .. versionadded:: 2.0.0 @@ -1321,17 +1289,6 @@ def asML(self): Convert this matrix to the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> mlSM2 = mllibSM.asML() - >>> mlSM2 == mlSM1 - True - >>> mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> mlSMt2 = mllibSMt.asML() - >>> mlSMt2 == mlSMt1 - True - :return: :py:class:`pyspark.ml.linalg.SparseMatrix` .. versionadded:: 2.0.0 @@ -1365,29 +1322,10 @@ def fromML(mat): Convert a matrix from the new mllib-local representation. This does NOT copy the data; it copies references. - >>> mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) - >>> mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) - >>> mllibDM2 = Matrices.fromML(mlDM) - >>> mllibDM1 == mllibDM2 - True - >>> mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) - >>> mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) - >>> mllibDMt2 = Matrices.fromML(mlDMt) - >>> mllibDMt1 == mllibDMt2 - True - >>> mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) - >>> mllibSM2 = Matrices.fromML(mlSM) - >>> mllibSM1 == mllibSM2 - True - >>> mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) - >>> mllibSMt2 = Matrices.fromML(mlSMt) - >>> mllibSMt1 == mllibSMt2 - True - - :param vec: a :py:class:`pyspark.ml.linalg.Matrix` + :param mat: a :py:class:`pyspark.ml.linalg.Matrix` :return: a :py:class:`pyspark.mllib.linalg.Matrix` + + .. versionadded:: 2.0.0 """ if isinstance(mat, newlinalg.DenseMatrix): return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 74cf7bb8eaf9..72fa8b5f3d47 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -49,6 +49,7 @@ import unittest from pyspark import SparkContext +import pyspark.ml.linalg as newlinalg from pyspark.mllib.common import _to_java_object_rdd from pyspark.mllib.clustering import StreamingKMeans, StreamingKMeansModel from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector,\ @@ -423,6 +424,74 @@ def test_norms(self): tmp = SparseVector(4, [0, 2], [3, 0]) self.assertEqual(tmp.numNonzeros(), 1) + def test_ml_mllib_vector_conversion(self): + # to ml + # dense + mllibDV = Vectors.dense([1, 2, 3]) + mlDV1 = newlinalg.Vectors.dense([1, 2, 3]) + mlDV2 = mllibDV.asML() + self.assertEqual(mlDV2, mlDV1) + # sparse + mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV2 = mllibSV.asML() + self.assertEqual(mlSV2, mlSV1) + # from ml + # dense + mllibDV1 = Vectors.dense([1, 2, 3]) + mlDV = newlinalg.Vectors.dense([1, 2, 3]) + mllibDV2 = Vectors.fromML(mlDV) + self.assertEqual(mllibDV1, mllibDV2) + # sparse + mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5}) + mllibSV2 = Vectors.fromML(mlSV) + self.assertEqual(mllibSV1, mllibSV2) + + def test_ml_mllib_matrix_conversion(self): + # to ml + # dense + mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3]) + mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3]) + mlDM2 = mllibDM.asML() + self.assertEqual(mlDM2, mlDM1) + # transposed + mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True) + mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True) + mlDMt2 = mllibDMt.asML() + self.assertEqual(mlDMt2, mlDMt1) + # sparse + mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM2 = mllibSM.asML() + self.assertEqual(mlSM2, mlSM1) + # transposed + mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt2 = mllibSMt.asML() + self.assertEqual(mlSMt2, mlSMt1) + # from ml + # dense + mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4]) + mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4]) + mllibDM2 = Matrices.fromML(mlDM) + self.assertEqual(mllibDM1, mllibDM2) + # transposed + mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True) + mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True) + mllibDMt2 = Matrices.fromML(mlDMt) + self.assertEqual(mllibDMt1, mllibDMt2) + # sparse + mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + mllibSM2 = Matrices.fromML(mlSM) + self.assertEqual(mllibSM1, mllibSM2) + # transposed + mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + mllibSMt2 = Matrices.fromML(mlSMt) + self.assertEqual(mllibSMt1, mllibSMt2) + class ListTests(MLlibTestCase):