@@ -210,14 +210,19 @@ row. Any RDD of dictionaries can be converted to a SchemaRDD and then registered as
210210can be used in subsequent SQL statements.
211211
212212{% highlight python %}
213+ # Load a text file and convert each line to a dictionary.
213214lines = sc.textFile("examples/src/main/resources/people.txt")
214215parts = lines.map(lambda l: l.split(","))
215216people = parts.map(lambda p: {"name": p[ 0] , "age": int(p[ 1] )})
216217
218+ # Infer the schema, and register the SchemaRDD as a table.
217219peopleTable = sqlCtx.inferSchema(people)
218220peopleTable.registerAsTable("people")
219221
222+ # SQL can be run over SchemaRDDs that have been registered as a table.
220223teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
224+
225+ # The results of SQL queries are RDDs and support all the normal RDD operations.
221226teenNames = teenagers.map(lambda p: "Name: " + p.name)
222227{% endhighlight %}
223228
@@ -291,11 +296,11 @@ peopleTable # The SchemaRDD from the previous example.
291296# SchemaRDDs can be saved as parquet files, maintaining the schema information.
292297peopleTable.saveAsParquetFile("people.parquet")
293298
294- // Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.
295- // The result of loading a parquet file is also a JavaSchemaRDD.
299+ # Read in the parquet file created above. Parquet files are self-describing so the schema is preserved.
300+ # The result of loading a parquet file is also a SchemaRDD.
296301parquetFile = sqlCtx.parquetFile("people.parquet")
297302
298- // Parquet files can also be registered as tables and then used in SQL statements.
303+ # Parquet files can also be registered as tables and then used in SQL statements.
299304parquetFile.registerAsTable("parquetFile");
300305teenagers = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
301306
@@ -401,7 +406,7 @@ hiveCtx = HiveContext(sqlCtx)
401406hiveCtx.hql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
402407hiveCtx.hql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
403408
404- // Queries are expressed in HiveQL.
409+ # Queries can be expressed in HiveQL.
405410results = hiveCtx.hql("FROM src SELECT key, value").collect()
406411
407412{% endhighlight %}
0 commit comments