Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ sealed trait Matrix extends Serializable {
}

/**
* Column-majored dense matrix.
* Column-major dense matrix.
* The entry values are stored in a single array of doubles with columns listed in sequence.
* For example, the following matrix
* {{{
Expand Down Expand Up @@ -128,7 +128,7 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double])
}

/**
* Column-majored sparse matrix.
* Column-major sparse matrix.
* The entry values are stored in Compressed Sparse Column (CSC) format.
* For example, the following matrix
* {{{
Expand Down Expand Up @@ -207,7 +207,7 @@ class SparseMatrix(
object Matrices {

/**
* Creates a column-majored dense matrix.
* Creates a column-major dense matrix.
*
* @param numRows number of rows
* @param numCols number of columns
Expand All @@ -218,7 +218,7 @@ object Matrices {
}

/**
* Creates a column-majored sparse matrix in Compressed Sparse Column (CSC) format.
* Creates a column-major sparse matrix in Compressed Sparse Column (CSC) format.
*
* @param numRows number of rows
* @param numCols number of columns
Expand Down
150 changes: 121 additions & 29 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,41 @@ def _convert_to_vector(l):
raise TypeError("Cannot convert type %s into Vector" % type(l))


def _vector_size(v):
    """
    Returns the size of the vector.

    Accepts the vector-like inputs handled below: MLlib vectors,
    array.array, list, tuple, NumPy ndarrays that are 1-dimensional
    or have a single column, and (when SciPy is available) SciPy
    sparse matrices with a single column.

    >>> _vector_size([1., 2., 3.])
    3
    >>> _vector_size((1., 2., 3.))
    3
    >>> _vector_size(array.array('d', [1., 2., 3.]))
    3
    >>> _vector_size(np.zeros(3))
    3
    >>> _vector_size(np.zeros((3, 1)))
    3
    >>> _vector_size(np.zeros((1, 3)))
    Traceback (most recent call last):
        ...
    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
    """
    # MLlib vectors and plain Python sequences all report their size via len().
    if isinstance(v, Vector) or type(v) in (array.array, list, tuple):
        return len(v)

    if type(v) == np.ndarray:
        # Only 1-d arrays and (n, 1) column arrays count as vectors.
        single_column = v.ndim == 2 and v.shape[1] == 1
        if v.ndim != 1 and not single_column:
            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
        return len(v)

    if _have_scipy and scipy.sparse.issparse(v):
        # SciPy sparse inputs are always 2-d; require a single column.
        assert v.shape[1] == 1, "Expected column vector"
        return v.shape[0]

    raise TypeError("Cannot treat type %s as a vector" % type(v))


class Vector(object):
"""
Abstract class for DenseVector and SparseVector
Expand All @@ -76,6 +111,9 @@ def toArray(self):


class DenseVector(Vector):
"""
A dense vector represented by a value array.
"""
def __init__(self, ar):
if not isinstance(ar, array.array):
ar = array.array('d', ar)
Expand All @@ -100,15 +138,31 @@ def dot(self, other):
5.0
>>> dense.dot(np.array(range(1, 3)))
5.0
>>> dense.dot([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
array([ 5., 11.])
>>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if isinstance(other, SparseVector):
return other.dot(self)
if type(other) == np.ndarray and other.ndim > 1:
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.toArray(), other)
elif _have_scipy and scipy.sparse.issparse(other):
return other.transpose().dot(self.toArray())[0]
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
assert len(self) == other.shape[0], "dimension mismatch"
return other.transpose().dot(self.toArray())
else:
return np.dot(self.toArray(), other)
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.dot(self)
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), other)

def squared_distance(self, other):
"""
Expand All @@ -126,7 +180,16 @@ def squared_distance(self, other):
>>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
>>> dense1.squared_distance(sparse1)
2.0
>>> dense1.squared_distance([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
Expand Down Expand Up @@ -165,12 +228,10 @@ def __getattr__(self, item):


class SparseVector(Vector):

"""
A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy's {scipy.sparse} data types.
"""

def __init__(self, size, *args):
"""
Create a sparse vector, using either a dictionary, a list of
Expand Down Expand Up @@ -222,20 +283,33 @@ def dot(self, other):
0.0
>>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
array([ 22., 22.])
>>> a.dot([1., 2., 3.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.array([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(DenseVector([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.zeros((3, 2)))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if type(other) == np.ndarray:
if other.ndim == 1:
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
return result
elif other.ndim == 2:
if other.ndim == 2:
results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
return np.array(results)
else:
raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
elif other.ndim > 2:
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)

assert len(self) == _vector_size(other), "dimension mismatch"

elif type(other) in (array.array, DenseVector):
if type(other) in (np.ndarray, array.array, DenseVector):
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
Expand All @@ -254,6 +328,7 @@ def dot(self, other):
else:
j += 1
return result

else:
return self.dot(_convert_to_vector(other))

Expand All @@ -273,7 +348,16 @@ def squared_distance(self, other):
30.0
>>> b.squared_distance(a)
30.0
>>> b.squared_distance([1., 2.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
Expand Down Expand Up @@ -348,7 +432,6 @@ def __eq__(self, other):
>>> v1 != v2
False
"""

return (isinstance(other, self.__class__)
and other.size == self.size
and other.indices == self.indices
Expand Down Expand Up @@ -414,23 +497,32 @@ def stringify(vector):


class Matrix(object):
    """
    Represents a local matrix.

    This base class only records the dimensions; toArray() must be
    provided by a subclass.
    """

    def __init__(self, numRows, numCols):
        """
        Stores the matrix dimensions.

        numRows is the number of rows and numCols the number of columns.
        """
        self.numRows = numRows
        self.numCols = numCols

    def toArray(self):
        """
        Returns its elements in a NumPy ndarray.

        Not implemented on the base class: always raises
        NotImplementedError.
        """
        raise NotImplementedError


class DenseMatrix(Matrix):
def __init__(self, nRow, nCol, values):
Matrix.__init__(self, nRow, nCol)
assert len(values) == nRow * nCol
"""
Column-major dense matrix.
"""
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
assert len(values) == numRows * numCols
self.values = values

def __reduce__(self):
return DenseMatrix, (self.nRow, self.nCol, self.values)
return DenseMatrix, (self.numRows, self.numCols, self.values)

def toArray(self):
"""
Expand All @@ -439,10 +531,10 @@ def toArray(self):
>>> arr = array.array('d', [float(i) for i in range(4)])
>>> m = DenseMatrix(2, 2, arr)
>>> m.toArray()
array([[ 0., 1.],
[ 2., 3.]])
array([[ 0., 2.],
[ 1., 3.]])
"""
return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring())
return np.reshape(self.values, (self.numRows, self.numCols), order='F')


def _test():
Expand Down