From 73c80cc3b947d5a94fe68d927727473b41644c08 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 1 Oct 2018 15:53:09 -0700 Subject: [PATCH 1/3] [SPARK-25589][SQL][TEST] Add BloomFilterBenchmark --- .../BloomFilterBenchmark-results.txt | 26 ++++++ .../benchmark/BloomFilterBenchmark.scala | 87 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 sql/core/benchmarks/BloomFilterBenchmark-results.txt create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt new file mode 100644 index 000000000000..3108fef94365 --- /dev/null +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -0,0 +1,26 @@ +================================================================================================ +ORC Write +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.14 +Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz + +Write 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 11884 / 12248 8.4 118.8 1.0X +With bloom filter 13963 / 13975 7.2 139.6 0.9X + + +================================================================================================ +ORC Read +================================================================================================ + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.14 +Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz + +Read a row from 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------ +Without bloom filter 1160 / 1178 86.2 11.6 1.0X +With bloom filter 951 / 959 105.1 9.5 1.2X + + diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala new file mode 100644 index 000000000000..921a881c7673 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import scala.util.Random + +import org.apache.spark.benchmark.Benchmark + +/** + * Benchmark to measure read performance with Bloom filters. + * + * Currently, only ORC supports bloom filters, we will add Parquet BM as soon as it becomes + * available. + * + * To run this benchmark: + * {{{ + * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/BloomFilterBenchmark-results.txt". 
+ * }}} + */ +object BloomFilterBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private val scaleFactor = 100 + private val N = scaleFactor * 1000 * 1000 + private val df = spark.range(N).map(_ => Random.nextInt) + + private def writeBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + runBenchmark(s"ORC Write") { + val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + df.write.mode("overwrite").orc(path + "/withoutBF") + } + benchmark.addCase("With bloom filter") { _ => + df.write.mode("overwrite") + .option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + } + benchmark.run() + } + } + } + + private def readBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + df.write.orc(path + "/withoutBF") + df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF") + + runBenchmark(s"ORC Read") { + val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + spark.read.orc(path + "/withoutBF").where("value = 0").count + } + benchmark.addCase("With bloom filter") { _ => + spark.read.orc(path + "/withBF").where("value = 0").count + } + benchmark.run() + } + } + } + + override def benchmark(): Unit = { + writeBenchmark() + readBenchmark() + } +} From fb116f0589471dc8d0b9787b308fc54fc5004cfc Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 2 Oct 2018 10:46:34 -0700 Subject: [PATCH 2/3] Rename benchmark to runBenchmarkSuite. 
--- .../spark/sql/execution/benchmark/BloomFilterBenchmark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index 921a881c7673..2f3caca849cd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -80,7 +80,7 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } - override def benchmark(): Unit = { + override def runBenchmarkSuite(): Unit = { writeBenchmark() readBenchmark() } From 6e917e57db33a92274e79de9906194fb650a9171 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 2 Oct 2018 18:37:04 +0000 Subject: [PATCH 3/3] Update result with EC2 r3.xlarge again --- .../BloomFilterBenchmark-results.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/sql/core/benchmarks/BloomFilterBenchmark-results.txt b/sql/core/benchmarks/BloomFilterBenchmark-results.txt index 3108fef94365..2eeb26c899b4 100644 --- a/sql/core/benchmarks/BloomFilterBenchmark-results.txt +++ b/sql/core/benchmarks/BloomFilterBenchmark-results.txt @@ -2,25 +2,23 @@ ORC Write ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.14 -Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Write 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Without bloom filter 11884 / 12248 8.4 118.8 1.0X -With bloom filter 13963 / 13975 7.2 139.6 0.9X +Without bloom filter 16765 / 17587 6.0 167.7 1.0X 
+With bloom filter 20060 / 20626 5.0 200.6 0.8X ================================================================================================ ORC Read ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.14 -Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz - +OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Read a row from 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ -Without bloom filter 1160 / 1178 86.2 11.6 1.0X -With bloom filter 951 / 959 105.1 9.5 1.2X +Without bloom filter 1857 / 1904 53.9 18.6 1.0X +With bloom filter 1399 / 1437 71.5 14.0 1.3X