10 changes: 10 additions & 0 deletions external/avro/benchmarks/AvroWriteBenchmark-results.txt
@@ -0,0 +1,10 @@
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Avro writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      3213 / 3373          4.9         204.3       1.0X
Output Single Double Column                   3313 / 3345          4.7         210.7       1.0X
Output Int and String Column                  7303 / 7316          2.2         464.3       0.4X
Output Partitions                             5309 / 5691          3.0         337.5       0.6X
Output Buckets                                7031 / 7557          2.2         447.0       0.5X

external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
@@ -19,22 +19,19 @@ package org.apache.spark.sql.execution.benchmark

 /**
  * Benchmark to measure Avro data sources write performance.
- * Usage:
- * 1. with spark-submit: bin/spark-submit --class <this class> <spark sql test jar>
- * 2. with sbt: build/sbt "avro/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt: bin/spark-submit --class <this class>
Review comment — @wangyum (Member), Oct 29, 2018:

Please add avro:

bin/spark-submit --class <this class> --jars <spark core test jar>,<spark catalyst test jar>,<spark sql test jar> <spark avro test jar>
Review comment — @wangyum (Member):

I hit an exception when running:

bin/spark-submit --class org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar ./external/avro/target/spark-avro_2.11-3.0.0-SNAPSHOT-tests.jar
Exception in thread "main" org.apache.spark.sql.AnalysisException: Failed to find data source: Avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".;
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:94)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:93)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:313)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableCommand.run(createDataSourceTables.scala:78)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
......

Review comment — @yucai (Contributor, Author), Oct 30, 2018:

@wangyum Good catch! I think it needs <spark avro jar>; added.

+ *      --jars <spark core test jar>,<spark catalyst test jar>,
+ *        <spark sql test jar>,<spark avro jar>
+ *        <spark avro test jar>
+ *   2. build/sbt "avro/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>"
+ *      Results will be written to "benchmarks/AvroWriteBenchmark-results.txt".
+ * }}}
  */
 object AvroWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Avro writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      2481 / 2499          6.3         157.8       1.0X
-    Output Single Double Column                   2705 / 2710          5.8         172.0       0.9X
-    Output Int and String Column                  5539 / 5639          2.8         352.2       0.4X
-    Output Partitions                             4613 / 5004          3.4         293.3       0.5X
-    Output Buckets                                5554 / 5561          2.8         353.1       0.4X
-    */
-    runBenchmark("Avro")
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runDataSourceBenchmark("Avro")
   }
 }
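Note on the new structure: both benchmark objects in this PR stop defining main and instead override runBenchmarkSuite. Below is a minimal sketch of the harness contract that implies, inferred only from the calls visible in this diff and from the SPARK_GENERATE_BENCHMARK_FILES step in the usage docs; the class name BenchmarkHarnessSketch is a hypothetical stand-in, and Spark's real BenchmarkBase may differ in its details.

import java.io.{File, FileOutputStream, OutputStream}

// Hypothetical stand-in for the harness the benchmarks now extend.
abstract class BenchmarkHarnessSketch {
  // Benchmark instances write their result tables here instead of stdout
  // when the harness opens a file (cf. `output = output` in the trait below).
  var output: Option[OutputStream] = None

  // Concrete benchmarks implement this instead of main().
  def runBenchmarkSuite(mainArgs: Array[String]): Unit

  // Emits the "====" banner seen in the *-results.txt files, then runs the body.
  def runBenchmark(name: String)(body: => Unit): Unit = {
    val sep = "=" * 96
    val banner = s"$sep\n$name\n$sep\n\n"
    output.fold(print(banner))(_.write(banner.getBytes))
    body
  }

  def main(args: Array[String]): Unit = {
    if (System.getenv("SPARK_GENERATE_BENCHMARK_FILES") == "1") {
      val name = this.getClass.getSimpleName.stripSuffix("$")
      output = Some(new FileOutputStream(new File(s"benchmarks/$name-results.txt")))
    }
    try runBenchmarkSuite(args) finally output.foreach(_.close())
  }
}

Under a contract like this, SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>" regenerates benchmarks/AvroWriteBenchmark-results.txt, which is exactly the new results file at the top of this diff.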
60 changes: 60 additions & 0 deletions sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
@@ -0,0 +1,60 @@
================================================================================================
Parquet writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Parquet writer benchmark:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      2354 / 2438          6.7         149.7       1.0X
Output Single Double Column                   2462 / 2485          6.4         156.5       1.0X
Output Int and String Column                  8083 / 8100          1.9         513.9       0.3X
Output Partitions                             5015 / 5027          3.1         318.8       0.5X
Output Buckets                                6883 / 6887          2.3         437.6       0.3X


================================================================================================
ORC writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
ORC writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      1769 / 1789          8.9         112.4       1.0X
Output Single Double Column                   1989 / 2009          7.9         126.5       0.9X
Output Int and String Column                  7323 / 7400          2.1         465.6       0.2X
Output Partitions                             4374 / 4381          3.6         278.1       0.4X
Output Buckets                                6086 / 6104          2.6         386.9       0.3X


================================================================================================
JSON writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      2954 / 4085          5.3         187.8       1.0X
Output Single Double Column                   3832 / 3837          4.1         243.6       0.8X
Output Int and String Column                 9591 / 10336          1.6         609.8       0.3X
Output Partitions                             4956 / 4994          3.2         315.1       0.6X
Output Buckets                                6608 / 6676          2.4         420.1       0.4X


================================================================================================
CSV writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
CSV writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      4118 / 4125          3.8         261.8       1.0X
Output Single Double Column                   4888 / 4891          3.2         310.8       0.8X
Output Int and String Column                  9788 / 9872          1.6         622.3       0.4X
Output Partitions                             6578 / 6640          2.4         418.2       0.6X
Output Buckets                                9125 / 9171          1.7         580.2       0.5X


sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
@@ -18,62 +18,40 @@ package org.apache.spark.sql.execution.benchmark

 /**
  * Benchmark to measure built-in data sources write performance.
- * By default it measures 4 data source formats: Parquet, ORC, JSON, CSV. Run it with spark-submit:
- *   spark-submit --class <this class> <spark sql test jar>
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   By default it measures 4 data source formats: Parquet, ORC, JSON, CSV.
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *      --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ *
+ *   To measure specified formats, run it with arguments.
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
+ *   2. build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt
+ *      "sql/test:runMain <this class> format1 [format2] [...]"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ * }}}
- *
- * To measure specified formats, run it with arguments:
- *   spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
  */
 object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    val formats: Seq[String] = if (args.isEmpty) {
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    val formats: Seq[String] = if (mainArgs.isEmpty) {
       Seq("Parquet", "ORC", "JSON", "CSV")
     } else {
-      args
+      mainArgs
     }
 
     spark.conf.set("spark.sql.parquet.compression.codec", "snappy")
     spark.conf.set("spark.sql.orc.compression.codec", "snappy")
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Parquet writer benchmark:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1815 / 1932          8.7         115.4       1.0X
-    Output Single Double Column                   1877 / 1878          8.4         119.3       1.0X
-    Output Int and String Column                  6265 / 6543          2.5         398.3       0.3X
-    Output Partitions                             4067 / 4457          3.9         258.6       0.4X
-    Output Buckets                                5608 / 5820          2.8         356.6       0.3X
-
-    ORC writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1201 / 1239         13.1          76.3       1.0X
-    Output Single Double Column                   1542 / 1600         10.2          98.0       0.8X
-    Output Int and String Column                  6495 / 6580          2.4         412.9       0.2X
-    Output Partitions                             3648 / 3842          4.3         231.9       0.3X
-    Output Buckets                                5022 / 5145          3.1         319.3       0.2X
-
-    JSON writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1988 / 2093          7.9         126.4       1.0X
-    Output Single Double Column                   2854 / 2911          5.5         181.4       0.7X
-    Output Int and String Column                  6467 / 6653          2.4         411.1       0.3X
-    Output Partitions                             4548 / 5055          3.5         289.1       0.4X
-    Output Buckets                                5664 / 5765          2.8         360.1       0.4X
-
-    CSV writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      3025 / 3190          5.2         192.3       1.0X
-    Output Single Double Column                   3575 / 3634          4.4         227.3       0.8X
-    Output Int and String Column                  7313 / 7399          2.2         464.9       0.4X
-    Output Partitions                             5105 / 5190          3.1         324.6       0.6X
-    Output Buckets                                6986 / 6992          2.3         444.1       0.4X
-    */
     formats.foreach { format =>
-      runBenchmark(format)
+      runBenchmark(s"$format writer benchmark") {
+        runDataSourceBenchmark(format)
+      }
     }
   }
 }
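The Best/Avg tables in the results files above are rendered by the Benchmark instances that DataSourceWriteBenchmark (next file) now constructs with output = output. A hedged usage sketch follows; the addCase and run calls follow the org.apache.spark.benchmark.Benchmark API as this diff uses it, but treat the exact signatures and defaults as assumptions rather than a definitive reference.

import org.apache.spark.benchmark.Benchmark

object BenchmarkUsageSketch {
  def main(args: Array[String]): Unit = {
    val numRows = 1024 * 1024 * 15  // same row count as DataSourceWriteBenchmark
    // output = None prints the table to stdout; the harness passes a file
    // stream instead when SPARK_GENERATE_BENCHMARK_FILES=1.
    val benchmark = new Benchmark("Parquet writer benchmark", numRows, output = None)
    benchmark.addCase("Output Single Int Column") { _ =>
      // The real case body issues an INSERT OVERWRITE ... SELECT against a
      // temp view (see writeNumeric in the trait below); a no-op stands in here.
      ()
    }
    benchmark.run()  // prints Best/Avg Time(ms), Rate(M/s), Per Row(ns), Relative
  }
}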
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
@@ -16,18 +16,9 @@
  */
 package org.apache.spark.sql.execution.benchmark
 
-import org.apache.spark.SparkConf
 import org.apache.spark.benchmark.Benchmark
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
 
-trait DataSourceWriteBenchmark {
-  val conf = new SparkConf()
-    .setAppName("DataSourceWriteBenchmark")
-    .setIfMissing("spark.master", "local[1]")
-    .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
-
-  val spark = SparkSession.builder.config(conf).getOrCreate()
+trait DataSourceWriteBenchmark extends SqlBasedBenchmark {
 
   val tempTable = "temp"
   val numRows = 1024 * 1024 * 15
@@ -75,7 +66,7 @@ trait DataSourceWriteBenchmark {
     }
   }
 
-  def runBenchmark(format: String): Unit = {
+  def runDataSourceBenchmark(format: String): Unit = {
     val tableInt = "tableInt"
     val tableDouble = "tableDouble"
     val tableIntString = "tableIntString"
@@ -84,7 +75,7 @@ trait DataSourceWriteBenchmark {
     withTempTable(tempTable) {
       spark.range(numRows).createOrReplaceTempView(tempTable)
       withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
-        val benchmark = new Benchmark(s"$format writer benchmark", numRows)
+        val benchmark = new Benchmark(s"$format writer benchmark", numRows, output = output)
         writeNumeric(tableInt, format, benchmark, "Int")
         writeNumeric(tableDouble, format, benchmark, "Double")
         writeIntString(tableIntString, format, benchmark)
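DataSourceWriteBenchmark now gets its SparkSession from SqlBasedBenchmark instead of building one itself. SqlBasedBenchmark's source is not part of this diff; the sketch below reconstructs what it plausibly supplies from the SparkConf block deleted above. The stand-in name SqlBasedBenchmarkSketch, and anything the real trait adds beyond this (it presumably also mixes in the harness sketched earlier so that `output` is in scope), are assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

// Hypothetical stand-in for SqlBasedBenchmark: the same session setup the
// trait used to do inline, now shared by all SQL-based benchmarks.
trait SqlBasedBenchmarkSketch {
  val spark: SparkSession = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setIfMissing("spark.master", "local[1]")  // honor an externally set master
      .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
    SparkSession.builder.config(conf).getOrCreate()
  }
}

This also explains the deleted imports above: SparkConf, SparkSession, and SQLConf move behind the shared trait, while the Benchmark import stays because the trait still constructs Benchmark instances directly.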