[SPARK-16992][PYSPARK][DOCS] import sort and autopep8 on Pyspark examples #14830
Changes from all commits: 78b66d8, 4af5966, ef1306e, 31cea6d, 582c822
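The PR title describes running an import sort and autopep8 over the PySpark examples. The exact commands used are not recorded on this page, so the snippet below is only a hedged sketch of how such a sweep could be done programmatically; the examples directory path and the tool options are assumptions, and isort (>= 5) plus autopep8 need to be installed.

    # Hedged sketch (not the actual commands from this PR): sweep the example
    # sources with isort and autopep8. Assumes `pip install isort autopep8`
    # and that the examples live under the path below.
    import pathlib

    import autopep8
    import isort

    EXAMPLES_DIR = pathlib.Path("examples/src/main/python")  # assumed location

    for path in sorted(EXAMPLES_DIR.rglob("*.py")):
        source = path.read_text()
        fixed = isort.code(source)  # sort imports (isort >= 5 API)
        fixed = autopep8.fix_code(fixed, options={"max_line_length": 100})
        if fixed != source:
            path.write_text(fixed)
            print("reformatted:", path)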
Changes to the ALS example (appName "PythonALS"):

@@ -26,8 +26,8 @@
 import sys

 import numpy as np
-from numpy.random import rand
 from numpy import matrix
+from numpy.random import rand
 from pyspark.sql import SparkSession

 LAMBDA = 0.01   # regularization
@@ -62,10 +62,10 @@ def update(i, mat, ratings):
         example. Please use pyspark.ml.recommendation.ALS for more
         conventional use.""", file=sys.stderr)

-    spark = SparkSession\
-        .builder\
-        .appName("PythonALS")\
-        .getOrCreate()
+    spark = (SparkSession
+             .builder
+             .appName("PythonALS")
+             .getOrCreate())

     sc = spark.sparkContext
@@ -87,17 +87,19 @@ def update(i, mat, ratings):
     usb = sc.broadcast(us)

     for i in range(ITERATIONS):
-        ms = sc.parallelize(range(M), partitions) \
-            .map(lambda x: update(x, usb.value, Rb.value)) \
-            .collect()
+        ms = (sc
+              .parallelize(range(M), partitions)
+              .map(lambda x: update(x, usb.value, Rb.value))
+              .collect())
         # collect() returns a list, so array ends up being
         #   a 3-d array, we take the first 2 dims for the matrix
         ms = matrix(np.array(ms)[:, :, 0])
         msb = sc.broadcast(ms)

-        us = sc.parallelize(range(U), partitions) \
-            .map(lambda x: update(x, msb.value, Rb.value.T)) \
-            .collect()
+        us = (sc
+              .parallelize(range(U), partitions)
+              .map(lambda x: update(x, msb.value, Rb.value.T))
+              .collect())
         us = matrix(np.array(us)[:, :, 0])
         usb = sc.broadcast(us)
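The change above replaces trailing backslashes with a single parenthesized expression, so the chained calls can be broken across lines without continuation characters. A minimal self-contained sketch of the same style, not part of the PR (assumes a local PySpark installation; the app name is made up):

    # Standalone illustration of the parenthesized chaining style adopted above:
    # build a session and chain RDD operations without backslash continuations.
    from pyspark.sql import SparkSession

    spark = (SparkSession
             .builder
             .appName("ChainedStyleSketch")  # hypothetical app name
             .getOrCreate())
    sc = spark.sparkContext

    squares = (sc
               .parallelize(range(10), 2)
               .map(lambda x: x * x)
               .collect())
    print(squares)

    spark.stop()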
Changes to the CountVectorizer example:

@@ -17,23 +17,26 @@
 from __future__ import print_function

-from pyspark.sql import SparkSession
 # $example on$
 from pyspark.ml.feature import CountVectorizer
 # $example off$
+from pyspark.sql import SparkSession


 if __name__ == "__main__":
-    spark = SparkSession\
-        .builder\
-        .appName("CountVectorizerExample")\
-        .getOrCreate()
+    spark = (SparkSession
+             .builder
+             .appName("CountVectorizerExample")
+             .getOrCreate())

     # $example on$
     # Input data: Each row is a bag of words with a ID.
-    df = spark.createDataFrame([
-        (0, "a b c".split(" ")),
-        (1, "a b b c a".split(" "))
-    ], ["id", "words"])
+    df = spark.createDataFrame(
+        [
+            (0, "a b c".split(" ")),
+            (1, "a b b c a".split(" "))
+        ],
Inline review comments on this hunk:

Member: Could you double-check whether it really does not follow pep8? I have seen the removed syntax more often (e.g., …).

Contributor (Author): Indeed, this is a recommendation, not an obligation. I see it as looking more like Scala multi-line code, and I prefer it. It is a personal opinion, and I don't think there is a pylint/pep8 check to prevent using it.
The remainder of the hunk:

+        ["id", "words"])

     # fit a CountVectorizerModel from the corpus.
     cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
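As a quick usage check, a minimal runnable version of this example with the new createDataFrame layout could look like the following; it simply restates the reformatted lines and adds the standard fit/transform calls (assumes a local PySpark installation).

    # Runnable sketch of the reformatted CountVectorizer example
    # (assumes `pip install pyspark`).
    from pyspark.ml.feature import CountVectorizer
    from pyspark.sql import SparkSession

    spark = (SparkSession
             .builder
             .appName("CountVectorizerExample")
             .getOrCreate())

    df = spark.createDataFrame(
        [
            (0, "a b c".split(" ")),
            (1, "a b b c a".split(" "))
        ],
        ["id", "words"])

    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
    model = cv.fit(df)
    model.transform(df).show(truncate=False)

    spark.stop()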
Changes to the cross-validator example:

@@ -35,27 +35,29 @@
 """

 if __name__ == "__main__":
-    spark = SparkSession\
-        .builder\
-        .appName("CrossValidatorExample")\
-        .getOrCreate()
+    spark = (SparkSession
+             .builder
+             .appName("CrossValidatorExample")
+             .getOrCreate())

     # $example on$
     # Prepare training documents, which are labeled.
-    training = spark.createDataFrame([
-        (0, "a b c d e spark", 1.0),
-        (1, "b d", 0.0),
-        (2, "spark f g h", 1.0),
-        (3, "hadoop mapreduce", 0.0),
-        (4, "b spark who", 1.0),
-        (5, "g d a y", 0.0),
-        (6, "spark fly", 1.0),
-        (7, "was mapreduce", 0.0),
-        (8, "e spark program", 1.0),
-        (9, "a e c l", 0.0),
-        (10, "spark compile", 1.0),
-        (11, "hadoop software", 0.0)
-    ], ["id", "text", "label"])
+    training = spark.createDataFrame(
+        [
+            (0, "a b c d e spark", 1.0),
+            (1, "b d", 0.0),
+            (2, "spark f g h", 1.0),
+            (3, "hadoop mapreduce", 0.0),
+            (4, "b spark who", 1.0),
+            (5, "g d a y", 0.0),
+            (6, "spark fly", 1.0),
+            (7, "was mapreduce", 0.0),
+            (8, "e spark program", 1.0),
+            (9, "a e c l", 0.0),
+            (10, "spark compile", 1.0),
+            (11, "hadoop software", 0.0)
+        ],
+        ["id", "text", "label"])

     # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
Review comments on this change:

Contributor (Author): Replace the backslash syntax with the more elegant (and pep8-recommended) parenthesis syntax.

Member: OK. I don't feel qualified to judge that, so I'll take your word for it. However, do you really want to indent this so much?

Contributor (Author): My point was the use of parentheses instead of the backslash, which is recommended by pep8. I can keep the indentation.

Member: Maybe I am wrong. Could you provide the reference? Do you refer to this line? I know the rule about binary operators follows this, but I guess this case is not disallowed. I am not sure it is worth sweeping them all; they look preferred but do not break pep8. I mean, it does not seem to discourage this line break.
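For reference, PEP 8 prefers implied line continuation inside parentheses, brackets and braces over backslashes, but it does not forbid the backslash, and pycodestyle's default configuration ignores the warnings about line breaks around binary operators (W503/W504). A small hedged check along these lines, assuming pycodestyle is installed, could look like this:

    # Hedged sketch: run pycodestyle over both continuation styles and print
    # how many violations each one reports (with default settings, W503/W504
    # about line breaks around binary operators are ignored).
    import tempfile

    import pycodestyle

    BACKSLASH_STYLE = (
        "total = 1 + 2 \\\n"
        "    + 3\n"
    )
    PAREN_STYLE = (
        "total = (1 + 2\n"
        "         + 3)\n"
    )

    for label, snippet in [("backslash", BACKSLASH_STYLE), ("parentheses", PAREN_STYLE)]:
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as handle:
            handle.write(snippet)
            path = handle.name
        report = pycodestyle.StyleGuide(quiet=True).check_files([path])
        print(label, "violations:", report.total_errors)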