diff --git a/external/avro/benchmarks/AvroWriteBenchmark-results.txt b/external/avro/benchmarks/AvroWriteBenchmark-results.txt new file mode 100644 index 000000000000..fb2a77333eec --- /dev/null +++ b/external/avro/benchmarks/AvroWriteBenchmark-results.txt @@ -0,0 +1,10 @@ +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Avro writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Output Single Int Column 3213 / 3373 4.9 204.3 1.0X +Output Single Double Column 3313 / 3345 4.7 210.7 1.0X +Output Int and String Column 7303 / 7316 2.2 464.3 0.4X +Output Partitions 5309 / 5691 3.0 337.5 0.6X +Output Buckets 7031 / 7557 2.2 447.0 0.5X + diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala index df13b4a1c2d3..0b11434757c9 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala @@ -19,22 +19,19 @@ package org.apache.spark.sql.execution.benchmark /** * Benchmark to measure Avro data sources write performance. - * Usage: - * 1. with spark-submit: bin/spark-submit --class - * 2. with sbt: build/sbt "avro/test:runMain " + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class + * --jars ,, + * , + * + * 2. build/sbt "avro/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain " + * Results will be written to "benchmarks/AvroWriteBenchmark-results.txt". 
+ * }}} */ object AvroWriteBenchmark extends DataSourceWriteBenchmark { - def main(args: Array[String]): Unit = { - /* - Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz - Avro writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 2481 / 2499 6.3 157.8 1.0X - Output Single Double Column 2705 / 2710 5.8 172.0 0.9X - Output Int and String Column 5539 / 5639 2.8 352.2 0.4X - Output Partitions 4613 / 5004 3.4 293.3 0.5X - Output Buckets 5554 / 5561 2.8 353.1 0.4X - */ - runBenchmark("Avro") + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runDataSourceBenchmark("Avro") } } diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt new file mode 100644 index 000000000000..9d656fc10dce --- /dev/null +++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt @@ -0,0 +1,60 @@ +================================================================================================ +Parquet writer benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Output Single Int Column 2354 / 2438 6.7 149.7 1.0X +Output Single Double Column 2462 / 2485 6.4 156.5 1.0X +Output Int and String Column 8083 / 8100 1.9 513.9 0.3X +Output Partitions 5015 / 5027 3.1 318.8 0.5X +Output Buckets 6883 / 6887 2.3 437.6 0.3X + + +================================================================================================ +ORC writer benchmark 
+================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Output Single Int Column 1769 / 1789 8.9 112.4 1.0X +Output Single Double Column 1989 / 2009 7.9 126.5 0.9X +Output Int and String Column 7323 / 7400 2.1 465.6 0.2X +Output Partitions 4374 / 4381 3.6 278.1 0.4X +Output Buckets 6086 / 6104 2.6 386.9 0.3X + + +================================================================================================ +JSON writer benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Output Single Int Column 2954 / 4085 5.3 187.8 1.0X +Output Single Double Column 3832 / 3837 4.1 243.6 0.8X +Output Int and String Column 9591 / 10336 1.6 609.8 0.3X +Output Partitions 4956 / 4994 3.2 315.1 0.6X +Output Buckets 6608 / 6676 2.4 420.1 0.4X + + +================================================================================================ +CSV writer benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Output Single Int Column 4118 / 4125 3.8 261.8 1.0X +Output Single Double 
Column 4888 / 4891 3.2 310.8 0.8X +Output Int and String Column 9788 / 9872 1.6 622.3 0.4X +Output Partitions 6578 / 6640 2.4 418.2 0.6X +Output Buckets 9125 / 9171 1.7 580.2 0.5X + + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala index 2de516c19da9..cd97324c997f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala @@ -18,62 +18,40 @@ package org.apache.spark.sql.execution.benchmark /** * Benchmark to measure built-in data sources write performance. - * By default it measures 4 data source format: Parquet, ORC, JSON, CSV. Run it with spark-submit: - * spark-submit --class - * Or with sbt: - * build/sbt "sql/test:runMain " + * To run this benchmark: + * {{{ + * By default it measures 4 data source format: Parquet, ORC, JSON, CSV. + * 1. without sbt: bin/spark-submit --class + * --jars , + * 2. build/sbt "sql/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt". + * + * To measure specified formats, run it with arguments. + * 1. without sbt: + * bin/spark-submit --class format1 [format2] [...] + * 2. build/sbt "sql/test:runMain format1 [format2] [...]" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt + * "sql/test:runMain format1 [format2] [...]" + * Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt". + * }}} * - * To measure specified formats, run it with arguments: - * spark-submit --class format1 [format2] [...] 
- * Or with sbt: - * build/sbt "sql/test:runMain format1 [format2] [...]" */ object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark { - def main(args: Array[String]): Unit = { - val formats: Seq[String] = if (args.isEmpty) { + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val formats: Seq[String] = if (mainArgs.isEmpty) { Seq("Parquet", "ORC", "JSON", "CSV") } else { - args + mainArgs } spark.conf.set("spark.sql.parquet.compression.codec", "snappy") spark.conf.set("spark.sql.orc.compression.codec", "snappy") - /* - Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz - Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1815 / 1932 8.7 115.4 1.0X - Output Single Double Column 1877 / 1878 8.4 119.3 1.0X - Output Int and String Column 6265 / 6543 2.5 398.3 0.3X - Output Partitions 4067 / 4457 3.9 258.6 0.4X - Output Buckets 5608 / 5820 2.8 356.6 0.3X - - ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1201 / 1239 13.1 76.3 1.0X - Output Single Double Column 1542 / 1600 10.2 98.0 0.8X - Output Int and String Column 6495 / 6580 2.4 412.9 0.2X - Output Partitions 3648 / 3842 4.3 231.9 0.3X - Output Buckets 5022 / 5145 3.1 319.3 0.2X - - JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------------ - Output Single Int Column 1988 / 2093 7.9 126.4 1.0X - Output Single Double Column 2854 / 2911 5.5 181.4 0.7X - Output Int and String Column 6467 / 6653 2.4 411.1 0.3X - Output Partitions 4548 / 5055 3.5 289.1 0.4X - Output Buckets 5664 / 5765 2.8 360.1 0.4X - CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - 
------------------------------------------------------------------------------------------------ - Output Single Int Column 3025 / 3190 5.2 192.3 1.0X - Output Single Double Column 3575 / 3634 4.4 227.3 0.8X - Output Int and String Column 7313 / 7399 2.2 464.9 0.4X - Output Partitions 5105 / 5190 3.1 324.6 0.6X - Output Buckets 6986 / 6992 2.3 444.1 0.4X - */ formats.foreach { format => - runBenchmark(format) + runBenchmark(s"$format writer benchmark") { + runDataSourceBenchmark(format) + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala index 994d6b5b7d33..405d60794ede 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala @@ -16,18 +16,9 @@ */ package org.apache.spark.sql.execution.benchmark -import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.internal.SQLConf -trait DataSourceWriteBenchmark { - val conf = new SparkConf() - .setAppName("DataSourceWriteBenchmark") - .setIfMissing("spark.master", "local[1]") - .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") - - val spark = SparkSession.builder.config(conf).getOrCreate() +trait DataSourceWriteBenchmark extends SqlBasedBenchmark { val tempTable = "temp" val numRows = 1024 * 1024 * 15 @@ -75,7 +66,7 @@ trait DataSourceWriteBenchmark { } } - def runBenchmark(format: String): Unit = { + def runDataSourceBenchmark(format: String): Unit = { val tableInt = "tableInt" val tableDouble = "tableDouble" val tableIntString = "tableIntString" @@ -84,7 +75,7 @@ trait DataSourceWriteBenchmark { withTempTable(tempTable) { spark.range(numRows).createOrReplaceTempView(tempTable) withTable(tableInt, tableDouble, 
tableIntString, tablePartition, tableBucket) { - val benchmark = new Benchmark(s"$format writer benchmark", numRows) + val benchmark = new Benchmark(s"$format writer benchmark", numRows, output = output) writeNumeric(tableInt, format, benchmark, "Int") writeNumeric(tableDouble, format, benchmark, "Double") writeIntString(tableIntString, format, benchmark)