Commit e7a6c19

SchemaRDD.javaToPython should convert a field with the StructType to a Map.
1 parent 6d20b85 commit e7a6c19
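
In short, javaToPython previously copied each top-level column straight into a Java HashMap keyed by field name, so a column with a nested StructType was handed to the pickler as a raw Row. With this commit the conversion recurses into struct-typed fields and turns them into Java Maps, which unpickle on the Python side as plain dicts. A minimal PySpark sketch of the intended behavior, mirroring the doctests updated below (assumes an existing SparkContext `sc` and SQLContext `sqlCtx`):

    # Nested JSON record, as in the updated doctests in python/pyspark/sql.py.
    json = sc.parallelize(['{"field1": 1, "field2": "row1", "field3": {"field4": 11}}'])
    srdd = sqlCtx.jsonRDD(json)
    sqlCtx.registerRDDAsTable(srdd, "table1")
    srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 AS f2, field3 AS f3 FROM table1")
    # The struct-typed column now arrives in Python as a plain dict:
    print srdd2.collect()  # [{"f1": 1, "f2": "row1", "f3": {"field4": 11}}]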

3 files changed: +48, -17 lines changed

docs/sql-programming-guide.md

Lines changed: 11 additions & 4 deletions
@@ -238,6 +238,8 @@ teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 
 # The results of SQL queries are RDDs and support all the normal RDD operations.
 teenNames = teenagers.map(lambda p: "Name: " + p.name)
+for teenName in teenNames.collect():
+  print teenName
 {% endhighlight %}
 
 </div>
@@ -275,7 +277,7 @@ val parquetFile = sqlCtx.parquetFile("people.parquet")
 //Parquet files can also be registered as tables and then used in SQL statements.
 parquetFile.registerAsTable("parquetFile")
 val teenagers = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
-teenagers.collect().foreach(println)
+teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
 {% endhighlight %}
 
 </div>
@@ -311,10 +313,10 @@ List<String> teenagerNames = teenagers.map(new Function<Row, String>() {
 {% highlight python %}
 # sqlCtx from the previous example is used in this example.
 
-peopleTable # The SchemaRDD from the previous example.
+schemaPeople # The SchemaRDD from the previous example.
 
 # SchemaRDDs can be saved as Parquet files, maintaining the schema information.
-peopleTable.saveAsParquetFile("people.parquet")
+schemaPeople.saveAsParquetFile("people.parquet")
 
 # Read in the Parquet file created above. Parquet files are self-describing so the schema is preserved.
 # The result of loading a parquet file is also a SchemaRDD.
@@ -324,6 +326,8 @@ parquetFile = sqlCtx.parquetFile("people.parquet")
 parquetFile.registerAsTable("parquetFile");
 teenagers = sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
 teenNames = teenagers.map(lambda p: "Name: " + p.name)
+for teenName in teenNames.collect():
+  print teenName
 {% endhighlight %}
 
 </div>
@@ -477,11 +481,13 @@ people.printSchema()
 people.registerAsTable("people")
 
 # SQL statements can be run by using the sql methods provided by sqlCtx.
-val teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
+teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
 
 # The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
 # The columns of a row in the result can be accessed by ordinal.
 teenNames = teenagers.map(lambda p: "Name: " + p.name)
+for teenName in teenNames.collect():
+  print teenName
 
 # Alternatively, a SchemaRDD can be created for a JSON dataset represented by
 # a RDD[String] storing one JSON object per string.
@@ -597,6 +603,7 @@ val people: RDD[Person] = ... // An RDD of case class objects, from the first ex
 
 // The following is the same as 'SELECT name FROM people WHERE age >= 10 AND age <= 19'
 val teenagers = people.where('age >= 10).where('age <= 19).select('name)
+teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
 {% endhighlight %}
 
 The DSL uses Scala symbols to represent columns in the underlying table, which are identifiers

python/pyspark/sql.py

Lines changed: 12 additions & 9 deletions
@@ -138,9 +138,10 @@ def jsonFile(self, path):
         >>> ofn.close()
         >>> srdd = sqlCtx.jsonFile(jsonFile)
         >>> sqlCtx.registerRDDAsTable(srdd, "table1")
-        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1")
-        >>> srdd2.collect() == [{"f1" : 1, "f2" : "row1"}, {"f1" : 2, "f2": "row2"},
-        ... {"f1" : 3, "f2": "row3"}]
+        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")
+        >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},
+        ... {"f1": 2, "f2": "row2", "f3":{"field4":22}},
+        ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}]
         True
         """
         jschema_rdd = self._ssql_ctx.jsonFile(path)
@@ -151,9 +152,10 @@ def jsonRDD(self, rdd):
 
         >>> srdd = sqlCtx.jsonRDD(json)
         >>> sqlCtx.registerRDDAsTable(srdd, "table1")
-        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2 from table1")
-        >>> srdd2.collect() == [{"f1" : 1, "f2" : "row1"}, {"f1" : 2, "f2": "row2"},
-        ... {"f1" : 3, "f2": "row3"}]
+        >>> srdd2 = sqlCtx.sql("SELECT field1 AS f1, field2 as f2, field3 as f3 from table1")
+        >>> srdd2.collect() == [{"f1": 1, "f2": "row1", "f3":{"field4":11}},
+        ... {"f1": 2, "f2": "row2", "f3":{"field4":22}},
+        ... {"f1": 3, "f2": "row3", "f3":{"field4":33}}]
         True
         """
         def func(split, iterator):
@@ -369,7 +371,7 @@ def saveAsTable(self, tableName):
 
     def getSchemaTreeString(self):
         """Returns the output schema in the tree format."""
-        self._jschema_rdd.getSchemaTreeString()
+        return self._jschema_rdd.getSchemaTreeString()
 
     def printSchema(self):
         """Prints out the schema in the tree format."""
@@ -473,8 +475,9 @@ def _test():
     globs['sqlCtx'] = SQLContext(sc)
     globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"},
         {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}])
-    jsonStrings = ['{"field1": 1, "field2": "row1"}',
-        '{"field1" : 2, "field2": "row2"}', '{"field1" : 3, "field2": "row3"}']
+    jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
+        '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}',
+        '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}']
     globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
     (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS)
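
One behavioral note from the getSchemaTreeString hunk above: the method previously called through to the JVM but dropped the result, so it always returned None; it now returns the tree-format schema string, while printSchema (per its docstring) keeps printing it. A short usage sketch, reusing the nested-JSON srdd from the example near the top of this page:

    # Assumes the srdd built from the nested JSON example earlier.
    tree = srdd.getSchemaTreeString()  # now a string instead of None
    print tree
    srdd.printSchema()                 # prints the same tree to stdout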

sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

Lines changed: 25 additions & 4 deletions
@@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
-import org.apache.spark.sql.catalyst.types.BooleanType
+import org.apache.spark.sql.catalyst.types.{DataType, StructType, BooleanType}
 import org.apache.spark.sql.execution.{ExistingRdd, SparkLogicalPlan}
 import org.apache.spark.api.java.JavaRDD
 import java.util.{Map => JMap}
@@ -344,13 +344,34 @@ class SchemaRDD(
   def toJavaSchemaRDD: JavaSchemaRDD = new JavaSchemaRDD(sqlContext, logicalPlan)
 
   private[sql] def javaToPython: JavaRDD[Array[Byte]] = {
-    val fieldNames: Seq[String] = this.queryExecution.analyzed.output.map(_.name)
+    def rowToMap(row: Row, structType: StructType): JMap[String, Any] = {
+      val fields = structType.fields.map(field => (field.name, field.dataType))
+      val map: JMap[String, Any] = new java.util.HashMap
+      row.zip(fields).foreach {
+        case (obj, (name, dataType)) =>
+          dataType match {
+            case struct: StructType => map.put(name, rowToMap(obj.asInstanceOf[Row], struct))
+            case other => map.put(name, obj)
+          }
+      }
+
+      map
+    }
+
+    // TODO: Actually, the schema of a row should be represented by a StructType instead of
+    // a Seq[Attribute]. Once we have finished that change, we can just use rowToMap to
+    // construct the Map for python.
+    val fields: Seq[(String, DataType)] = this.queryExecution.analyzed.output.map(
+      field => (field.name, field.dataType))
     this.mapPartitions { iter =>
       val pickle = new Pickler
       iter.map { row =>
         val map: JMap[String, Any] = new java.util.HashMap
-        row.zip(fieldNames).foreach { case (obj, name) =>
-          map.put(name, obj)
+        row.zip(fields).foreach { case (obj, (name, dataType)) =>
+          dataType match {
+            case struct: StructType => map.put(name, rowToMap(obj.asInstanceOf[Row], struct))
+            case other => map.put(name, obj)
+          }
         }
         map
       }.grouped(10).map(batched => pickle.dumps(batched.toArray))