[SPARK-6227] [MLlib] [PySpark] Implement PySpark wrappers for SVD and PCA #7963
Changes from all commits
6248d0e
921d5b6
5558fa9
50ed700
59f53d5
70a871d
0bc6a3c
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,6 +83,25 @@ Applications](quick-start.html#self-contained-applications) section of the Spark | |
| quick-start guide. Be sure to also include *spark-mllib* in your build file as | ||
| a dependency. | ||
|
|
||
| </div> | ||
| <div data-lang="python" markdown="1"> | ||
| {% highlight python %} | ||
| from pyspark.mllib.linalg.distributed import RowMatrix | ||
| from numpy.random import RandomState | ||
|
|
||
| # Generate random data with 50 samples and 30 features. | ||
| rng = RandomState(0) | ||
| mat = RowMatrix(sc.parallelize(rng.randn(50, 30))) | ||
|
|
||
| # Compute the top 20 singular values and corresponding singular vectors. | ||
| svd = mat.computeSVD(20, computeU=True) | ||
| u = svd.U # The U factor is a RowMatrix. | ||
| s = svd.s # The singular values are stored in a local dense vector. | ||
| V = svd.V # The V factor is a local dense matrix. | ||
| {% endhighlight %} | ||
|
|
||
| The same code applies to `IndexedRowMatrix` if `U` is defined as an | ||
| `IndexedRowMatrix`. | ||
| </div> | ||
| </div> | ||
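To illustrate the note above about `IndexedRowMatrix`, here is a minimal sketch (not part of the patch) of the same SVD call on an `IndexedRowMatrix`; it assumes the same live SparkContext `sc` as the documentation example:

```python
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from numpy.random import RandomState

# Same random 50 x 30 data as above, but each row carries an explicit index.
rng = RandomState(0)
rows = sc.parallelize(
    [IndexedRow(i, row) for i, row in enumerate(rng.randn(50, 30))])
mat = IndexedRowMatrix(rows)

svd = mat.computeSVD(20, computeU=True)
u = svd.U  # Here U is an IndexedRowMatrix; row indices are preserved.
s = svd.s  # DenseVector of singular values in descending order.
V = svd.V  # Local DenseMatrix of right singular vectors.
```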
|
|
||
|
|
@@ -124,6 +143,29 @@ Refer to the [`RowMatrix` Java docs](api/java/org/apache/spark/mllib/linalg/dist | |
|
|
||
| {% include_example java/org/apache/spark/examples/mllib/JavaPCAExample.java %} | ||
|
|
||
| </div> | ||
|
|
||
| <div data-lang="python" markdown="1"> | ||
|
|
||
| The following code demonstrates how to compute principal components on a `RowMatrix` | ||
| and use them to project the vectors into a low-dimensional space. | ||
|
|
||
| {% highlight python %} | ||
|
Contributor: Same as above.
||
| from pyspark.mllib.linalg.distributed import RowMatrix | ||
| from numpy.random import RandomState | ||
|
|
||
| # Generate random data with 50 samples and 30 features. | ||
| rng = RandomState(0) | ||
| data = sc.parallelize(rng.randn(50, 30)) | ||
| mat = RowMatrix(data) | ||
|
|
||
| # Compute the top 10 principal components stored in a local dense matrix. | ||
| pc = mat.computePrincipalComponents(10) | ||
|
|
||
| # Project the rows to the linear space spanned by the top 10 principal components. | ||
| projected = mat.multiply(pc) | ||
| {% endhighlight %} | ||
|
|
||
| </div> | ||
| </div> | ||
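As a quick sanity check on the snippet above (a sketch, not part of the patch; it reuses `mat`, `pc` and `projected` from that example and relies on the existing `numRows()`/`numCols()` methods of `RowMatrix`):

```python
# The projection keeps all 50 rows and reduces the 30 features to the
# 10 requested principal components; pc itself is a local 30 x 10 DenseMatrix.
print(projected.numRows(), projected.numCols())  # 50 10
print(pc.numRows, pc.numCols)                    # 30 10
```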
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,7 +28,7 @@ | |
|
|
||
| from pyspark import RDD, since | ||
| from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper | ||
| from pyspark.mllib.linalg import _convert_to_vector, Matrix, QRDecomposition | ||
| from pyspark.mllib.linalg import _convert_to_vector, DenseMatrix, Matrix, QRDecomposition | ||
| from pyspark.mllib.stat import MultivariateStatisticalSummary | ||
| from pyspark.storagelevel import StorageLevel | ||
|
|
||
|
|
@@ -303,6 +303,121 @@ def tallSkinnyQR(self, computeQ=False): | |
| R = decomp.call("R") | ||
| return QRDecomposition(Q, R) | ||
|
|
||
| def computeSVD(self, k, computeU=False, rCond=1e-9): | ||
|
Contributor: Would be good to add a since annotation here.
||
| """ | ||
| Computes the singular value decomposition of the RowMatrix. | ||
|
|
||
| The given row matrix A of dimension (m X n) is decomposed into | ||
| U * s * V^T, where | ||
|
|
||
| * U: (m X k) (left singular vectors) is a RowMatrix whose | ||
| columns are the eigenvectors of (A X A') | ||
| * s: DenseVector consisting of the square roots of the | ||
| eigenvalues (singular values) in descending order. | ||
| * V: (n X k) (right singular vectors) is a Matrix whose columns | ||
| are the eigenvectors of (A' X A) | ||
|
|
||
| For more specific details on the implementation, please refer | ||
| to the Scala documentation. | ||
|
|
||
| :param k: Set the number of singular values to keep. | ||
|
Contributor: It might be good to copy the longer description from RowMatrix for the k param.
||
| :param computeU: Whether or not to compute U. If set to | ||
| True, then U is computed as A * V * s^-1 | ||
| :param rCond: Reciprocal condition number. All singular values | ||
| smaller than rCond * s[0] are treated as zero | ||
| where s[0] is the largest singular value. | ||
| :returns: SingularValueDecomposition object | ||
|
|
||
| >>> data = [(3, 1, 1), (-1, 3, 1)] | ||
| >>> rm = RowMatrix(sc.parallelize(data)) | ||
| >>> svd_model = rm.computeSVD(2, True) | ||
| >>> svd_model.U.rows.collect() | ||
| [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])] | ||
| >>> svd_model.s | ||
| DenseVector([3.4641, 3.1623]) | ||
| >>> svd_model.V | ||
| DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) | ||
| """ | ||
| j_model = self._java_matrix_wrapper.call( | ||
| "computeSVD", int(k), bool(computeU), float(rCond)) | ||
| return SingularValueDecomposition(j_model) | ||
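A local cross-check of the factorization returned by the new `computeSVD` (a sketch, not part of the patch): it rebuilds the small matrix from the doctest above as U * diag(s) * V^T and compares it to the input. It assumes a live SparkContext `sc`.

```python
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

data = [(3, 1, 1), (-1, 3, 1)]
rm = RowMatrix(sc.parallelize(data))
svd = rm.computeSVD(2, computeU=True)

U = np.array([row.toArray() for row in svd.U.rows.collect()])  # shape (2, 2)
s = svd.s.toArray()                                            # shape (2,)
V = svd.V.toArray()                                            # shape (3, 2)

# The input has rank 2, so the k=2 factorization reproduces it exactly
# (up to floating-point error).
approx = U.dot(np.diag(s)).dot(V.T)
print(np.allclose(approx, np.array(data, dtype=float)))  # True
```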
|
|
||
| def computePrincipalComponents(self, k): | ||
| """ | ||
| Computes the k principal components of the given row matrix. | ||
|
Contributor: It might be good to copy the warnings from RowMatrix here as well.
||
|
|
||
| :param k: Number of principal components to keep. | ||
| :returns: DenseMatrix | ||
|
|
||
| >>> data = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]]) | ||
| >>> rm = RowMatrix(data) | ||
|
|
||
| >>> # Returns the two principal components of rm | ||
| >>> pca = rm.computePrincipalComponents(2) | ||
| >>> pca | ||
| DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0) | ||
|
|
||
| >>> # Transform into new dimensions with the greatest variance. | ||
| >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE | ||
| [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \ | ||
| DenseVector([-4.6102, -4.9745])] | ||
| """ | ||
| return self._java_matrix_wrapper.call("computePrincipalComponents", k) | ||
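A local cross-check of `computePrincipalComponents` (a sketch, not part of the patch): the returned columns should match the top eigenvectors of the sample covariance matrix up to sign, and the covariance scale (n vs. n-1 denominator) does not affect the eigenvectors. Assumes a live SparkContext `sc`.

```python
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

local = np.array([[1.0, 2.0, 3.0], [2.0, 4.0, 5.0], [3.0, 6.0, 1.0]])
rm = RowMatrix(sc.parallelize(local.tolist()))
pc = rm.computePrincipalComponents(2).toArray()   # local (3, 2) array

cov = np.cov(local, rowvar=False)                 # sample covariance (3, 3)
eigvals, eigvecs = np.linalg.eigh(cov)            # ascending eigenvalues
top2 = eigvecs[:, np.argsort(eigvals)[::-1][:2]]  # top-2 eigenvectors

# Eigenvectors are only defined up to sign, so compare absolute values.
print(np.allclose(np.abs(pc), np.abs(top2)))      # True
```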
|
|
||
| def multiply(self, matrix): | ||
| """ | ||
| Multiplies the given RowMatrix with another matrix. | ||
|
|
||
| :param matrix: Matrix to multiply with. | ||
| :returns: RowMatrix | ||
|
|
||
| >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]])) | ||
| >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() | ||
| [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])] | ||
| """ | ||
| if not isinstance(matrix, DenseMatrix): | ||
| raise ValueError("Only multiplication with DenseMatrix " | ||
| "is supported.") | ||
| j_model = self._java_matrix_wrapper.call("multiply", matrix) | ||
| return RowMatrix(j_model) | ||
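A quick check of `multiply` against a plain local matrix product (a sketch, not part of the patch; assumes a live SparkContext `sc`):

```python
import numpy as np
from pyspark.mllib.linalg import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix

A = np.array([[0.0, 1.0], [2.0, 3.0]])
B = DenseMatrix(2, 2, [0, 2, 1, 3])      # column-major, i.e. [[0, 1], [2, 3]]

product = RowMatrix(sc.parallelize(A.tolist())).multiply(B)
local = np.array([r.toArray() for r in product.rows.collect()])

print(np.allclose(local, A.dot(B.toArray())))  # True
```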
|
|
||
|
|
||
| class SingularValueDecomposition(JavaModelWrapper): | ||
| """Wrapper around the SingularValueDecomposition scala case class""" | ||
|
Contributor: Probably add a versionAdded.
||
|
|
||
| @property | ||
| def U(self): | ||
| """ | ||
| Returns a distributed matrix whose columns are the left | ||
| singular vectors of the SingularValueDecomposition if | ||
| computeU was set to True. | ||
| """ | ||
| u = self.call("U") | ||
| if u is not None: | ||
| mat_name = u.getClass().getSimpleName() | ||
| if mat_name == "RowMatrix": | ||
| return RowMatrix(u) | ||
| elif mat_name == "IndexedRowMatrix": | ||
| return IndexedRowMatrix(u) | ||
| else: | ||
| raise TypeError("Expected RowMatrix/IndexedRowMatrix got %s" % mat_name) | ||
|
|
||
| @property | ||
| def s(self): | ||
| """ | ||
| Returns a DenseVector with singular values in | ||
| descending order. | ||
| """ | ||
| return self.call("s") | ||
|
|
||
| @property | ||
| def V(self): | ||
| """ | ||
| Returns a DenseMatrix whose columns are the right singular | ||
| vectors of the SingularValueDecomposition. | ||
| """ | ||
| return self.call("V") | ||
|
|
||
|
|
||
| class IndexedRow(object): | ||
| """ | ||
|
|
@@ -533,6 +648,62 @@ def toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024): | |
| colsPerBlock) | ||
| return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock) | ||
|
|
||
| def computeSVD(self, k, computeU=False, rCond=1e-9): | ||
| """ | ||
| Computes the singular value decomposition of the IndexedRowMatrix. | ||
|
|
||
| The given row matrix A of dimension (m X n) is decomposed into | ||
| U * s * V^T, where | ||
|
|
||
| * U: (m X k) (left singular vectors) is an IndexedRowMatrix | ||
| whose columns are the eigenvectors of (A X A') | ||
| * s: DenseVector consisting of the square roots of the | ||
| eigenvalues (singular values) in descending order. | ||
| * V: (n X k) (right singular vectors) is a Matrix whose columns | ||
| are the eigenvectors of (A' X A) | ||
|
|
||
| For more specific details on the implementation, please refer | ||
| to the Scala documentation. | ||
|
|
||
| :param k: Set the number of singular values to keep. | ||
| :param computeU: Whether or not to compute U. If set to | ||
| True, then U is computed as A * V * s^-1 | ||
| :param rCond: Reciprocal condition number. All singular values | ||
| smaller than rCond * s[0] are treated as zero | ||
| where s[0] is the largest singular value. | ||
| :returns: SingularValueDecomposition object | ||
|
|
||
| >>> data = [(0, (3, 1, 1)), (1, (-1, 3, 1))] | ||
| >>> irm = IndexedRowMatrix(sc.parallelize(data)) | ||
| >>> svd_model = irm.computeSVD(2, True) | ||
| >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE | ||
| [IndexedRow(0, [-0.707106781187,0.707106781187]),\ | ||
| IndexedRow(1, [-0.707106781187,-0.707106781187])] | ||
| >>> svd_model.s | ||
| DenseVector([3.4641, 3.1623]) | ||
| >>> svd_model.V | ||
| DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0) | ||
| """ | ||
| j_model = self._java_matrix_wrapper.call( | ||
| "computeSVD", int(k), bool(computeU), float(rCond)) | ||
| return SingularValueDecomposition(j_model) | ||
|
|
||
| def multiply(self, matrix): | ||
| """ | ||
| Multiplies the given IndexedRowMatrix with another matrix. | ||
|
|
||
| :param matrix: Matrix to multiply with. | ||
| :returns: IndexedRowMatrix | ||
|
|
||
| >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))])) | ||
| >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect() | ||
| [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])] | ||
| """ | ||
| if not isinstance(matrix, DenseMatrix): | ||
| raise ValueError("Only multiplication with DenseMatrix " | ||
| "is supported.") | ||
| return IndexedRowMatrix(self._java_matrix_wrapper.call("multiply", matrix)) | ||
|
|
||
|
|
||
| class MatrixEntry(object): | ||
| """ | ||
|
|
||
Nowadays we tend to write new examples separately and then use the include_example syntax to bring them in.