
Commit c5f2b30

Merge branch 'master' of git://git.apache.org/spark into SPARK-3854
2 parents: d80d71a + 69c67ab

File tree: 23 files changed (+2152, -22 lines)

core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala

Lines changed: 4 additions & 6 deletions
@@ -25,8 +25,6 @@ import java.util.{List => JList, ArrayList => JArrayList, Map => JMap, Collectio
 import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.language.existentials
-import scala.reflect.ClassTag
-import scala.util.{Try, Success, Failure}
 
 import net.razorvine.pickle.{Pickler, Unpickler}
 
@@ -42,7 +40,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.util.Utils
 
 private[spark] class PythonRDD(
-    parent: RDD[_],
+    @transient parent: RDD[_],
     command: Array[Byte],
     envVars: JMap[String, String],
     pythonIncludes: JList[String],
@@ -55,9 +53,9 @@ private[spark] class PythonRDD(
   val bufferSize = conf.getInt("spark.buffer.size", 65536)
   val reuse_worker = conf.getBoolean("spark.python.worker.reuse", true)
 
-  override def getPartitions = parent.partitions
+  override def getPartitions = firstParent.partitions
 
-  override val partitioner = if (preservePartitoning) parent.partitioner else None
+  override val partitioner = if (preservePartitoning) firstParent.partitioner else None
 
   override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = {
     val startTime = System.currentTimeMillis
@@ -234,7 +232,7 @@ private[spark] class PythonRDD(
         dataOut.writeInt(command.length)
         dataOut.write(command)
         // Data values
-        PythonRDD.writeIteratorToStream(parent.iterator(split, context), dataOut)
+        PythonRDD.writeIteratorToStream(firstParent.iterator(split, context), dataOut)
         dataOut.writeInt(SpecialLengths.END_OF_DATA_SECTION)
         dataOut.flush()
       } catch {

examples/src/main/python/streaming/hdfs_wordcount.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+ Counts words in new text files created in the given directory
+ Usage: hdfs_wordcount.py <directory>
+   <directory> is the directory that Spark Streaming will use to find and read new text files.
+
+ To run this on your local machine on directory `localdir`, run this example
+    $ bin/spark-submit examples/src/main/python/streaming/hdfs_wordcount.py localdir
+
+ Then create a text file in `localdir` and the words in the file will get counted.
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.streaming import StreamingContext
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print >> sys.stderr, "Usage: hdfs_wordcount.py <directory>"
+        exit(-1)
+
+    sc = SparkContext(appName="PythonStreamingHDFSWordCount")
+    ssc = StreamingContext(sc, 1)
+
+    lines = ssc.textFileStream(sys.argv[1])
+    counts = lines.flatMap(lambda line: line.split(" "))\
+        .map(lambda x: (x, 1))\
+        .reduceByKey(lambda a, b: a+b)
+    counts.pprint()
+
+    ssc.start()
+    ssc.awaitTermination()

examples/src/main/python/streaming/network_wordcount.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+ Counts words in UTF8 encoded, '\n' delimited text received from the network every second.
+ Usage: network_wordcount.py <hostname> <port>
+   <hostname> and <port> describe the TCP server that Spark Streaming would connect to receive data.
+
+ To run this on your local machine, you need to first run a Netcat server
+    `$ nc -lk 9999`
+ and then run the example
+    `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999`
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.streaming import StreamingContext
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print >> sys.stderr, "Usage: network_wordcount.py <hostname> <port>"
+        exit(-1)
+    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
+    ssc = StreamingContext(sc, 1)
+
+    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
+    counts = lines.flatMap(lambda line: line.split(" "))\
+        .map(lambda word: (word, 1))\
+        .reduceByKey(lambda a, b: a+b)
+    counts.pprint()
+
+    ssc.start()
+    ssc.awaitTermination()
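
Each of the streaming examples applies the same flatMap / map / reduceByKey word-count pipeline to every one-second batch. As a point of reference, here is a minimal non-streaming sketch of the same pipeline on a static RDD; it is illustrative only and not part of this commit:

    from pyspark import SparkContext

    # Word count over a one-off RDD; each streaming batch above is processed
    # the same way, just repeatedly.
    sc = SparkContext(appName="BatchWordCountSketch")
    lines = sc.parallelize(["to be or not to be"])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    print(counts.collect())  # e.g. [('or', 1), ('not', 1), ('to', 2), ('be', 2)]
    sc.stop()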

examples/src/main/python/streaming/stateful_network_wordcount.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+ Counts words in UTF8 encoded, '\n' delimited text received from the
+ network every second.
+
+ Usage: stateful_network_wordcount.py <hostname> <port>
+   <hostname> and <port> describe the TCP server that Spark Streaming
+   would connect to receive data.
+
+ To run this on your local machine, you need to first run a Netcat server
+    `$ nc -lk 9999`
+ and then run the example
+    `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \
+        localhost 9999`
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.streaming import StreamingContext
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print >> sys.stderr, "Usage: stateful_network_wordcount.py <hostname> <port>"
+        exit(-1)
+    sc = SparkContext(appName="PythonStreamingStatefulNetworkWordCount")
+    ssc = StreamingContext(sc, 1)
+    ssc.checkpoint("checkpoint")
+
+    def updateFunc(new_values, last_sum):
+        return sum(new_values) + (last_sum or 0)
+
+    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
+    running_counts = lines.flatMap(lambda line: line.split(" "))\
+        .map(lambda word: (word, 1))\
+        .updateStateByKey(updateFunc)
+
+    running_counts.pprint()
+
+    ssc.start()
+    ssc.awaitTermination()
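
In the stateful example, updateStateByKey invokes updateFunc once per key per batch, passing the counts observed in that batch and the running total carried over from earlier batches (None for a key seen for the first time). A quick standalone illustration of the function, using made-up batch values:

    def updateFunc(new_values, last_sum):
        return sum(new_values) + (last_sum or 0)

    # Batch 1: "spark" appears three times; no prior state for the key yet.
    print(updateFunc([1, 1, 1], None))  # 3
    # Batch 2: "spark" appears once more; the carried-over total was 3.
    print(updateFunc([1], 3))           # 4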

python/docs/conf.py

Lines changed: 1 addition & 1 deletion
@@ -131,7 +131,7 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+#html_static_path = ['_static']
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied

python/docs/epytext.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
     (r"L{([\w.()]+)}", r":class:`\1`"),
     (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"),
     (r"C{([\w.()]+)}", r":class:`\1`"),
-    (r"[IBCM]{(.+)}", r"`\1`"),
+    (r"[IBCM]{([^}]+)}", r"`\1`"),
     ('pyspark.rdd.RDD', 'RDD'),
 )
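
The epytext change swaps the greedy `(.+)` group for `([^}]+)` so that two inline markup spans on one line are converted separately rather than being captured as a single greedy match. A small standalone check of the two patterns (the sample sentence is made up):

    import re

    text = "Pass a C{list} or a C{dict} to the method."

    greedy = re.sub(r"[IBCM]{(.+)}", r"`\1`", text)
    fixed = re.sub(r"[IBCM]{([^}]+)}", r"`\1`", text)

    print(greedy)  # Pass a `list} or a C{dict` to the method.
    print(fixed)   # Pass a `list` or a `dict` to the method.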

python/docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Contents:
 
    pyspark
    pyspark.sql
+   pyspark.streaming
    pyspark.mllib
 
 

python/docs/pyspark.rst

Lines changed: 2 additions & 1 deletion
@@ -7,8 +7,9 @@ Subpackages
 .. toctree::
     :maxdepth: 1
 
-    pyspark.mllib
     pyspark.sql
+    pyspark.streaming
+    pyspark.mllib
 
 Contents
 --------

python/pyspark/context.py

Lines changed: 4 additions & 4 deletions
@@ -68,7 +68,7 @@ class SparkContext(object):
 
     def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                  environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
-                 gateway=None):
+                 gateway=None, jsc=None):
         """
         Create a new SparkContext. At least the master and app name should be set,
         either through the named parameters here or through C{conf}.
@@ -104,14 +104,14 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
         SparkContext._ensure_initialized(self, gateway=gateway)
         try:
             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
-                          conf)
+                          conf, jsc)
         except:
             # If an error occurs, clean up in order to allow future SparkContext creation:
             self.stop()
             raise
 
     def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
-                 conf):
+                 conf, jsc):
         self.environment = environment or {}
         self._conf = conf or SparkConf(_jvm=self._jvm)
         self._batchSize = batchSize  # -1 represents an unlimited batch size
@@ -154,7 +154,7 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize,
                 self.environment[varName] = v
 
         # Create the Java SparkContext through Py4J
-        self._jsc = self._initialize_context(self._conf._jconf)
+        self._jsc = jsc or self._initialize_context(self._conf._jconf)
 
         # Create a single Accumulator in Java that we'll send all our updates through;
         # they will be passed back to us through a TCP server
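
The new `jsc` keyword lets a Python SparkContext wrap a JavaSparkContext that already exists on the JVM side instead of creating a fresh one, presumably so that pyspark.streaming can rebuild a SparkContext around a JVM-side context. A minimal sketch of the idea; the helper name and the `existing_jsc` handle are illustrative, not part of this commit:

    from pyspark import SparkContext

    def wrap_jvm_context(existing_jsc, gateway=None):
        # `existing_jsc` is assumed to be a py4j handle to a live JavaSparkContext.
        # With `jsc` supplied, _do_init reuses it instead of calling
        # _initialize_context(self._conf._jconf); with jsc=None the old
        # behaviour is unchanged.
        return SparkContext(master="local[2]", appName="PythonStreamingApp",
                            gateway=gateway, jsc=existing_jsc)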

python/pyspark/mllib/feature.py

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,7 @@ def transform(self, word):
         """
         :param word: a word
         :return: vector representation of word
+
         Transforms a word to its vector representation
 
         Note: local use only
@@ -57,6 +58,7 @@ def findSynonyms(self, x, num):
         :param x: a word or a vector representation of word
         :param num: number of synonyms to find
         :return: array of (word, cosineSimilarity)
+
         Find synonyms of a word
 
         Note: local use only
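
For context, the two docstrings touched above belong to Word2VecModel.transform and Word2VecModel.findSynonyms. A short usage sketch with a toy corpus; the data and parameter values are illustrative only:

    from pyspark import SparkContext
    from pyspark.mllib.feature import Word2Vec

    sc = SparkContext(appName="Word2VecSketch")
    # Toy corpus: an RDD of tokenized sentences, repeated so each word clears
    # Word2Vec's minimum-count threshold.
    sentences = sc.parallelize([["hello", "spark"], ["hello", "streaming"]] * 100)

    model = Word2Vec().setVectorSize(10).fit(sentences)
    vec = model.transform("hello")             # vector representation of "hello"
    synonyms = model.findSynonyms("hello", 2)  # [(word, cosineSimilarity), ...]
    sc.stop()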
