From 0bed96ecbbefaad0e8f611967e3df62feb7482ff Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 23 Jun 2017 10:39:59 -0700 Subject: [PATCH 1/5] add codegenToSeq method: split codegen info into sequence --- .../spark/sql/execution/QueryExecution.scala | 5 ++++ .../spark/sql/execution/debug/package.scala | 25 +++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 34998cbd61552..1b8e1a23442d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -245,5 +245,10 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { println(org.apache.spark.sql.execution.debug.codegenString(executedPlan)) // scalastyle:on println } + + /** @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ + def codegenToSeq(): Seq[(String, String)] = { + org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 0395c43ba2cbc..108c2cb627679 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -51,6 +51,19 @@ package object debug { } def codegenString(plan: SparkPlan): String = { + val codegenSeq = codegenStringSeq(plan) + var output = s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n" + for (((subtree, code), i) <- codegenSeq.zipWithIndex) { + output += s"== Subtree ${i + 1} / ${codegenSeq.size} ==\n" + output += subtree + output += "\nGenerated code:\n" + output += s"${code}\n" + } + output + } + + /** @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ + def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan transform { case s: WholeStageCodegenExec => @@ -58,15 +71,11 @@ package object debug { s case s => s } - var output = s"Found ${codegenSubtrees.size} WholeStageCodegen subtrees.\n" - for ((s, i) <- codegenSubtrees.toSeq.zipWithIndex) { - output += s"== Subtree ${i + 1} / ${codegenSubtrees.size} ==\n" - output += s - output += "\nGenerated code:\n" - val (_, source) = s.doCodeGen() - output += s"${CodeFormatter.format(source)}\n" + codegenSubtrees.toSeq.map { + subtree => + val (_, source) = subtree.doCodeGen() + (subtree.toString, CodeFormatter.format(source)) } - output } /** From 73ef241530d805ef9dca76e5412f3f3488369fc6 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 23 Jun 2017 11:38:07 -0700 Subject: [PATCH 2/5] add test cases and comments --- .../org/apache/spark/sql/execution/debug/package.scala | 8 +++++++- .../apache/spark/sql/execution/debug/DebuggingSuite.scala | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 108c2cb627679..2f650c39525c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -50,6 +50,7 @@ package object debug { // scalastyle:on println } + /** Generate codegen debug info */ def codegenString(plan: SparkPlan): String = { val codegenSeq = codegenStringSeq(plan) var output = s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n" @@ -62,7 +63,12 @@ package object debug { output } - /** @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ + /** + * Find WholeStageCodegenExec subtrees in query plan and do codegen for each of them + * + * @param plan the query plan for codegen + * @return Sequence of WholeStageCodegen subtrees and corresponding codegen + */ def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan transform { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 4fc52c99fbeeb..3d23b62688200 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -38,4 +38,11 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { assert(res.contains("Subtree 2 / 2")) assert(res.contains("Object[]")) } + + test("debugCodegenStringSeq") { + val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) + assert(res.length == 2) + assert(res.seq.forall{case (subtree, code) => + subtree.contains("Range") && code.contains("Object[]")}) + } } From 42238b862e7b6e984967139e68259d8a5caae20f Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 23 Jun 2017 15:24:24 -0700 Subject: [PATCH 3/5] revise code style and comments --- .../org/apache/spark/sql/execution/QueryExecution.scala | 6 +++++- .../org/apache/spark/sql/execution/debug/package.scala | 5 ++--- .../apache/spark/sql/execution/debug/DebuggingSuite.scala | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 1b8e1a23442d9..6272e612573c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -246,7 +246,11 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { // scalastyle:on println } - /** @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan + * + * @return Sequence of WholeStageCodegen subtrees and corresponding codegen + */ def codegenToSeq(): Seq[(String, String)] = { org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 2f650c39525c2..8136812af3e5e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -64,7 +64,7 @@ package object debug { } /** - * Find WholeStageCodegenExec subtrees in query plan and do codegen for each of them + * Get WholeStageCodegenExec subtrees and the codegen in a query plan * * @param plan the query plan for codegen * @return Sequence of WholeStageCodegen subtrees and corresponding codegen @@ -77,8 +77,7 @@ package object debug { s case s => s } - codegenSubtrees.toSeq.map { - subtree => + codegenSubtrees.toSeq.map { subtree => val (_, source) = subtree.doCodeGen() (subtree.toString, CodeFormatter.format(source)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 3d23b62688200..adcaf2d76519f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -42,7 +42,7 @@ class DebuggingSuite extends SparkFunSuite with SharedSQLContext { test("debugCodegenStringSeq") { val res = codegenStringSeq(spark.range(10).groupBy("id").count().queryExecution.executedPlan) assert(res.length == 2) - assert(res.seq.forall{case (subtree, code) => + assert(res.forall{ case (subtree, code) => subtree.contains("Range") && code.contains("Object[]")}) } } From a349962eb04054a94de01b976b5c6217ea72519b Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 23 Jun 2017 15:37:41 -0700 Subject: [PATCH 4/5] revise comment --- .../org/apache/spark/sql/execution/debug/package.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 8136812af3e5e..f367db519c937 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -50,7 +50,12 @@ package object debug { // scalastyle:on println } - /** Generate codegen debug info */ + /** + * Get WholeStageCodegenExec subtrees and the codegen in a query plan into one String + * + * @param plan the query plan for codegen + * @return single String containing all WholeStageCodegen subtrees and corresponding codegen + */ def codegenString(plan: SparkPlan): String = { val codegenSeq = codegenStringSeq(plan) var output = s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n" From 7878c4985bc949fddefc1b93cec4ac0aff478ac8 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Fri, 23 Jun 2017 15:43:10 -0700 Subject: [PATCH 5/5] revise indent --- .../scala/org/apache/spark/sql/execution/debug/package.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index f367db519c937..a717cbd4a7df9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -83,8 +83,8 @@ package object debug { case s => s } codegenSubtrees.toSeq.map { subtree => - val (_, source) = subtree.doCodeGen() - (subtree.toString, CodeFormatter.format(source)) + val (_, source) = subtree.doCodeGen() + (subtree.toString, CodeFormatter.format(source)) } }