10 changes: 10 additions & 0 deletions external/avro/benchmarks/AvroWriteBenchmark-results.txt
@@ -0,0 +1,10 @@
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Avro writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      3213 / 3373          4.9         204.3       1.0X
Output Single Double Column                   3313 / 3345          4.7         210.7       1.0X
Output Int and String Column                  7303 / 7316          2.2         464.3       0.4X
Output Partitions                             5309 / 5691          3.0         337.5       0.6X
Output Buckets                                7031 / 7557          2.2         447.0       0.5X

external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
@@ -19,22 +19,19 @@ package org.apache.spark.sql.execution.benchmark

 /**
  * Benchmark to measure Avro data sources write performance.
- * Usage:
- * 1. with spark-submit: bin/spark-submit --class <this class> <spark sql test jar>
- * 2. with sbt: build/sbt "avro/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt: bin/spark-submit --class <this class>
Review comment — @wangyum (Member), Oct 29, 2018:

Please add avro:

bin/spark-submit --class <this class> --jars <spark core test jar>,<spark catalyst test jar>,<spark sql test jar> <spark avro test jar>
Review comment — @wangyum (Member):

I hit an exception when running:

bin/spark-submit --class org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar ./external/avro/target/spark-avro_2.11-3.0.0-SNAPSHOT-tests.jar
Exception in thread "main" org.apache.spark.sql.AnalysisException: Failed to find data source: Avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".;
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:94)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:93)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:313)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableCommand.run(createDataSourceTables.scala:78)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
......

Review comment — @yucai (Contributor, Author), Oct 30, 2018:

@wangyum Good catch! I think it needs <spark avro jar>; added.

+ *      --jars <spark core test jar>,<spark catalyst test jar>,
+ *        <spark sql test jar>,<spark avro jar>
+ *        <spark avro test jar>
+ *   2. build/sbt "avro/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>"
+ *      Results will be written to "benchmarks/AvroWriteBenchmark-results.txt".
+ * }}}
  */
 object AvroWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Avro writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      2481 / 2499          6.3         157.8       1.0X
-    Output Single Double Column                   2705 / 2710          5.8         172.0       0.9X
-    Output Int and String Column                  5539 / 5639          2.8         352.2       0.4X
-    Output Partitions                             4613 / 5004          3.4         293.3       0.5X
-    Output Buckets                                5554 / 5561          2.8         353.1       0.4X
-    */
-    runBenchmark("Avro")
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runDataSourceBenchmark("Avro")
   }
 }
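Note on the new structure: both benchmark objects in this PR stop defining main and instead override runBenchmarkSuite. Below is a minimal sketch of the harness contract that implies, inferred only from the calls visible in this diff and from the SPARK_GENERATE_BENCHMARK_FILES step in the usage docs; the class name BenchmarkHarnessSketch is a hypothetical stand-in, and Spark's real BenchmarkBase may differ in its details.

import java.io.{File, FileOutputStream, OutputStream}

// Hypothetical stand-in for the harness the benchmarks now extend.
abstract class BenchmarkHarnessSketch {
  // Benchmark instances write their result tables here instead of stdout
  // when the harness opens a file (cf. `output = output` in the trait below).
  var output: Option[OutputStream] = None

  // Concrete benchmarks implement this instead of main().
  def runBenchmarkSuite(mainArgs: Array[String]): Unit

  // Emits the "====" banner seen in the *-results.txt files, then runs the body.
  def runBenchmark(name: String)(body: => Unit): Unit = {
    val sep = "=" * 96
    val banner = s"$sep\n$name\n$sep\n\n"
    output.fold(print(banner))(_.write(banner.getBytes))
    body
  }

  def main(args: Array[String]): Unit = {
    if (System.getenv("SPARK_GENERATE_BENCHMARK_FILES") == "1") {
      val name = this.getClass.getSimpleName.stripSuffix("$")
      output = Some(new FileOutputStream(new File(s"benchmarks/$name-results.txt")))
    }
    try runBenchmarkSuite(args) finally output.foreach(_.close())
  }
}

Under a contract like this, SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>" regenerates benchmarks/AvroWriteBenchmark-results.txt, which is exactly the new results file at the top of this diff.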
60 changes: 60 additions & 0 deletions sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
@@ -0,0 +1,60 @@
================================================================================================
Parquet writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Parquet writer benchmark:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      2354 / 2438          6.7         149.7       1.0X
Output Single Double Column                   2462 / 2485          6.4         156.5       1.0X
Output Int and String Column                  8083 / 8100          1.9         513.9       0.3X
Output Partitions                             5015 / 5027          3.1         318.8       0.5X
Output Buckets                                6883 / 6887          2.3         437.6       0.3X


================================================================================================
ORC writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
ORC writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      1769 / 1789          8.9         112.4       1.0X
Output Single Double Column                   1989 / 2009          7.9         126.5       0.9X
Output Int and String Column                  7323 / 7400          2.1         465.6       0.2X
Output Partitions                             4374 / 4381          3.6         278.1       0.4X
Output Buckets                                6086 / 6104          2.6         386.9       0.3X


================================================================================================
JSON writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      2954 / 4085          5.3         187.8       1.0X
Output Single Double Column                   3832 / 3837          4.1         243.6       0.8X
Output Int and String Column                 9591 / 10336          1.6         609.8       0.3X
Output Partitions                             4956 / 4994          3.2         315.1       0.6X
Output Buckets                                6608 / 6676          2.4         420.1       0.4X


================================================================================================
CSV writer benchmark
================================================================================================

OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
CSV writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------
Output Single Int Column                      4118 / 4125          3.8         261.8       1.0X
Output Single Double Column                   4888 / 4891          3.2         310.8       0.8X
Output Int and String Column                  9788 / 9872          1.6         622.3       0.4X
Output Partitions                             6578 / 6640          2.4         418.2       0.6X
Output Buckets                                9125 / 9171          1.7         580.2       0.5X


sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
@@ -18,62 +18,40 @@ package org.apache.spark.sql.execution.benchmark

 /**
  * Benchmark to measure built-in data sources write performance.
- * By default it measures 4 data source formats: Parquet, ORC, JSON, CSV. Run it with spark-submit:
- *   spark-submit --class <this class> <spark sql test jar>
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   By default it measures 4 data source formats: Parquet, ORC, JSON, CSV.
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *      --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ *
+ *   To measure specified formats, run it with arguments.
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
+ *   2. build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt
+ *      "sql/test:runMain <this class> format1 [format2] [...]"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ * }}}
- *
- * To measure specified formats, run it with arguments:
- *   spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
  */
 object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    val formats: Seq[String] = if (args.isEmpty) {
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    val formats: Seq[String] = if (mainArgs.isEmpty) {
       Seq("Parquet", "ORC", "JSON", "CSV")
     } else {
-      args
+      mainArgs
     }
 
     spark.conf.set("spark.sql.parquet.compression.codec", "snappy")
     spark.conf.set("spark.sql.orc.compression.codec", "snappy")
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Parquet writer benchmark:               Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1815 / 1932          8.7         115.4       1.0X
-    Output Single Double Column                   1877 / 1878          8.4         119.3       1.0X
-    Output Int and String Column                  6265 / 6543          2.5         398.3       0.3X
-    Output Partitions                             4067 / 4457          3.9         258.6       0.4X
-    Output Buckets                                5608 / 5820          2.8         356.6       0.3X
-
-    ORC writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1201 / 1239         13.1          76.3       1.0X
-    Output Single Double Column                   1542 / 1600         10.2          98.0       0.8X
-    Output Int and String Column                  6495 / 6580          2.4         412.9       0.2X
-    Output Partitions                             3648 / 3842          4.3         231.9       0.3X
-    Output Buckets                                5022 / 5145          3.1         319.3       0.2X
-
-    JSON writer benchmark:                  Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1988 / 2093          7.9         126.4       1.0X
-    Output Single Double Column                   2854 / 2911          5.5         181.4       0.7X
-    Output Int and String Column                  6467 / 6653          2.4         411.1       0.3X
-    Output Partitions                             4548 / 5055          3.5         289.1       0.4X
-    Output Buckets                                5664 / 5765          2.8         360.1       0.4X
-
-    CSV writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      3025 / 3190          5.2         192.3       1.0X
-    Output Single Double Column                   3575 / 3634          4.4         227.3       0.8X
-    Output Int and String Column                  7313 / 7399          2.2         464.9       0.4X
-    Output Partitions                             5105 / 5190          3.1         324.6       0.6X
-    Output Buckets                                6986 / 6992          2.3         444.1       0.4X
-    */
     formats.foreach { format =>
-      runBenchmark(format)
+      runBenchmark(s"$format writer benchmark") {
+        runDataSourceBenchmark(format)
+      }
     }
   }
 }
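The Best/Avg tables in the results files above are rendered by the Benchmark instances that DataSourceWriteBenchmark (next file) now constructs with output = output. A hedged usage sketch follows; the addCase and run calls follow the org.apache.spark.benchmark.Benchmark API as this diff uses it, but treat the exact signatures and defaults as assumptions rather than a definitive reference.

import org.apache.spark.benchmark.Benchmark

object BenchmarkUsageSketch {
  def main(args: Array[String]): Unit = {
    val numRows = 1024 * 1024 * 15  // same row count as DataSourceWriteBenchmark
    // output = None prints the table to stdout; the harness passes a file
    // stream instead when SPARK_GENERATE_BENCHMARK_FILES=1.
    val benchmark = new Benchmark("Parquet writer benchmark", numRows, output = None)
    benchmark.addCase("Output Single Int Column") { _ =>
      // The real case body issues an INSERT OVERWRITE ... SELECT against a
      // temp view (see writeNumeric in the trait below); a no-op stands in here.
      ()
    }
    benchmark.run()  // prints Best/Avg Time(ms), Rate(M/s), Per Row(ns), Relative
  }
}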
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
@@ -16,18 +16,9 @@
  */
 package org.apache.spark.sql.execution.benchmark
 
-import org.apache.spark.SparkConf
 import org.apache.spark.benchmark.Benchmark
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
 
-trait DataSourceWriteBenchmark {
-  val conf = new SparkConf()
-    .setAppName("DataSourceWriteBenchmark")
-    .setIfMissing("spark.master", "local[1]")
-    .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
-
-  val spark = SparkSession.builder.config(conf).getOrCreate()
+trait DataSourceWriteBenchmark extends SqlBasedBenchmark {
 
   val tempTable = "temp"
   val numRows = 1024 * 1024 * 15
@@ -75,7 +66,7 @@ trait DataSourceWriteBenchmark {
     }
   }
 
-  def runBenchmark(format: String): Unit = {
+  def runDataSourceBenchmark(format: String): Unit = {
     val tableInt = "tableInt"
     val tableDouble = "tableDouble"
     val tableIntString = "tableIntString"
@@ -84,7 +75,7 @@ trait DataSourceWriteBenchmark {
     withTempTable(tempTable) {
       spark.range(numRows).createOrReplaceTempView(tempTable)
       withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
-        val benchmark = new Benchmark(s"$format writer benchmark", numRows)
+        val benchmark = new Benchmark(s"$format writer benchmark", numRows, output = output)
         writeNumeric(tableInt, format, benchmark, "Int")
         writeNumeric(tableDouble, format, benchmark, "Double")
         writeIntString(tableIntString, format, benchmark)
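DataSourceWriteBenchmark now gets its SparkSession from SqlBasedBenchmark instead of building one itself. SqlBasedBenchmark's source is not part of this diff; the sketch below reconstructs what it plausibly supplies from the SparkConf block deleted above. The stand-in name SqlBasedBenchmarkSketch, and anything the real trait adds beyond this (it presumably also mixes in the harness sketched earlier so that `output` is in scope), are assumptions.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

// Hypothetical stand-in for SqlBasedBenchmark: the same session setup the
// trait used to do inline, now shared by all SQL-based benchmarks.
trait SqlBasedBenchmarkSketch {
  val spark: SparkSession = {
    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setIfMissing("spark.master", "local[1]")  // honor an externally set master
      .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
    SparkSession.builder.config(conf).getOrCreate()
  }
}

This also explains the deleted imports above: SparkConf, SparkSession, and SQLConf move behind the shared trait, while the Benchmark import stays because the trait still constructs Benchmark instances directly.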