From 230df385ac214e0446b58469bb708aedc070e442 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Tue, 27 Apr 2021 16:43:54 +0800 Subject: [PATCH] [NSE-283] Pick S3/CSV supports to OAP 1.1 (#284) * [NSE-237] Add ARROW_CSV=ON to default C++ build commands (#238) * [NSE-261] ArrowDataSource: Add S3 Support (#270) Closes #261 * [NSE-276] Add option to switch Hadoop version * [NSE-119] clean up on comments (#288) Signed-off-by: Yuan Zhou * [NSE-206]Update installation guide and configuration guide. (#289) * [NSE-206]Update installation guide and configuration guide. * Fix numaBinding setting issue. & Update description for protobuf * [NSE-206]Fix Prerequisite and Arrow Installation Steps. (#290) Co-authored-by: Yuan Co-authored-by: Wei-Ting Chen --- .github/workflows/tpch.yml | 2 +- .github/workflows/unittests.yml | 2 +- README.md | 27 ++++++-- arrow-data-source/.travis.yml | 2 +- arrow-data-source/README.md | 2 +- .../docs/ApacheArrowInstallation.md | 2 +- arrow-data-source/pom.xml | 68 ++++++++++++++++--- .../datasources/v2/arrow/ArrowUtils.scala | 5 ++ .../arrow/ArrowDataSourceTest.scala | 15 ++-- docs/ApacheArrowInstallation.md | 26 ++----- docs/Configuration.md | 42 +++++++++++- docs/Installation.md | 21 +++--- docs/Prerequisite.md | 42 ++++++++++-- native-sql-engine/core/pom.xml | 4 -- native-sql-engine/cpp/src/CMakeLists.txt | 1 + .../ext/hash_aggregate_kernel.cc | 5 +- pom.xml | 15 ++++ 17 files changed, 209 insertions(+), 72 deletions(-) diff --git a/.github/workflows/tpch.yml b/.github/workflows/tpch.yml index 650563ff2..b0d44c452 100644 --- a/.github/workflows/tpch.yml +++ b/.github/workflows/tpch.yml @@ -44,7 +44,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-3.0.0-oap-1.1 && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2 + cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DARROW_JEMALLOC=OFF && make -j2 sudo make install cd ../../java mvn clean install -B -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index d1f95d2c1..b3bd6dac0 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -47,7 +47,7 @@ jobs: git clone https://github.com/oap-project/arrow.git cd arrow && git checkout arrow-3.0.0-oap-1.1 && cd cpp mkdir build && cd build - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 + cmake .. 
-DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON -DGTEST_ROOT=/usr/src/gtest && make -j2 sudo make install - name: Run unit tests run: | diff --git a/README.md b/README.md index f392d2317..d2f035834 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,20 @@ We implemented columnar shuffle to improve the shuffle performance. With the col Please check the operator supporting details [here](./docs/operators.md) -## Build the Plugin +## How to use OAP: Native SQL Engine + +There are three ways to use OAP: Native SQL Engine, +1. Use precompiled jars +2. Building by Conda Environment +3. Building by Yourself + +### Use precompiled jars + +Please go to [OAP's Maven Central Repository](https://repo1.maven.org/maven2/com/intel/oap/) to find Native SQL Engine jars. +For usage, you will require below two jar files: +1. spark-arrow-datasource-standard--jar-with-dependencies.jar is located in com/intel/oap/spark-arrow-datasource-standard// +2. spark-columnar-core--jar-with-dependencies.jar is located in com/intel/oap/spark-columnar-core// +Please notice the files are fat jars shipped with our custom Arrow library and pre-compiled from our server(using GCC 9.3.0 and LLVM 7.0.1), which means you will require to pre-install GCC 9.3.0 and LLVM 7.0.1 in your system for normal usage. ### Building by Conda @@ -51,18 +64,18 @@ Then you can just skip below steps and jump to Getting Started [Get Started](#ge If you prefer to build from the source code on your hand, please follow below steps to set up your environment. -### Prerequisite +#### Prerequisite + There are some requirements before you build the project. Please check the document [Prerequisite](./docs/Prerequisite.md) and make sure you have already installed the software in your system. If you are running a SPARK Cluster, please make sure all the software are installed in every single node. -### Installation -Please check the document [Installation Guide](./docs/Installation.md) +#### Installation -### Configuration & Testing -Please check the document [Configuration Guide](./docs/Configuration.md) +Please check the document [Installation Guide](./docs/Installation.md) ## Get started + To enable OAP NativeSQL Engine, the previous built jar `spark-columnar-core--jar-with-dependencies.jar` should be added to Spark configuration. We also recommend to use `spark-arrow-datasource-standard--jar-with-dependencies.jar`. We will demonstrate an example by using both jar files. SPARK related options are: @@ -75,6 +88,8 @@ SPARK related options are: For Spark Standalone Mode, please set the above value as relative path to the jar file. For Spark Yarn Cluster Mode, please set the above value as absolute path to the jar file. +More Configuration, please check the document [Configuration Guide](./docs/Configuration.md) + Example to run Spark Shell with ArrowDataSource jar file ``` ${SPARK_HOME}/bin/spark-shell \ diff --git a/arrow-data-source/.travis.yml b/arrow-data-source/.travis.yml index 5b219ed66..5c938a101 100644 --- a/arrow-data-source/.travis.yml +++ b/arrow-data-source/.travis.yml @@ -26,7 +26,7 @@ jobs: - cd arrow && git checkout oap-master && cd cpp - sed -i "s/\${Python3_EXECUTABLE}/\/opt\/pyenv\/shims\/python3/g" CMakeLists.txt - mkdir build && cd build - - cmake .. 
-DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make + - cmake .. -DARROW_JNI=ON -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_FILESYSTEM=ON -DARROW_WITH_SNAPPY=ON -DARROW_JSON=ON -DARROW_DATASET=ON -DARROW_WITH_LZ4=ON && make - sudo make install - cd ../../java - mvn clean install -q -P arrow-jni -am -Darrow.cpp.build.dir=/tmp/arrow/cpp/build/release/ -DskipTests -Dcheckstyle.skip diff --git a/arrow-data-source/README.md b/arrow-data-source/README.md index 0409f46c9..4fdfa8bd4 100644 --- a/arrow-data-source/README.md +++ b/arrow-data-source/README.md @@ -125,7 +125,7 @@ git clone -b https://github.com/Intel-bigdata/arrow.git cd arrow/cpp mkdir build cd build -cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. +cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. make // build and install arrow jvm library diff --git a/arrow-data-source/docs/ApacheArrowInstallation.md b/arrow-data-source/docs/ApacheArrowInstallation.md index 4e0647f74..06cee2312 100644 --- a/arrow-data-source/docs/ApacheArrowInstallation.md +++ b/arrow-data-source/docs/ApacheArrowInstallation.md @@ -42,7 +42,7 @@ git clone https://github.com/Intel-bigdata/arrow.git cd arrow && git checkout branch-0.17.0-oap-1.0 mkdir -p arrow/cpp/release-build cd arrow/cpp/release-build -cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. +cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. 
make -j make install diff --git a/arrow-data-source/pom.xml b/arrow-data-source/pom.xml index ad55360e0..f49659982 100644 --- a/arrow-data-source/pom.xml +++ b/arrow-data-source/pom.xml @@ -3,7 +3,7 @@ com.intel.oap native-sql-engine-parent 1.1.0 - + 4.0.0 com.intel.oap @@ -18,12 +18,6 @@ parquet - 2.12.10 - 2.12 - 3.0.0 - 3.0.0 - UTF-8 - UTF-8 ${arrow.script.dir} ${cpp_tests} ${build_arrow} @@ -48,6 +42,50 @@ + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.core + jackson-databind + + + javax.servlet + servlet-api + + + com.sun.jersey + jersey-core + + + com.sun.jersey + jersey-json + + + com.sun.jersey + jersey-server + + + commons-httpclient + commons-httpcore + + + + + org.apache.httpcomponents + httpcore + 4.2 + org.scala-lang scala-library @@ -61,7 +99,7 @@ org.apache.arrow - arrow-format + arrow-vector provided @@ -83,6 +121,12 @@ org.apache.spark spark-catalyst_2.12 ${spark.version} + + + org.apache.arrow + arrow-vector + + test-jar test @@ -90,6 +134,12 @@ org.apache.spark spark-sql_2.12 ${spark.version} + + + org.apache.arrow + arrow-vector + + test-jar test @@ -118,7 +168,7 @@ bash - ${script.dir}/build_arrow.sh + ${script.dir}/build_arrow.sh --tests=${datasource.cpp_tests} --build_arrow=${datasource.build_arrow} --static_arrow=${datasource.static_arrow} diff --git a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala index 4ef604114..4af788d0e 100644 --- a/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala +++ b/arrow-data-source/standard/src/main/scala/com/intel/oap/spark/sql/execution/datasources/v2/arrow/ArrowUtils.scala @@ -156,6 +156,11 @@ object ArrowUtils { private def rewriteUri(uriStr: String): String = { val uri = URI.create(uriStr) + if (uri.getScheme == "s3" || uri.getScheme == "s3a") { + val s3Rewritten = new URI("s3", uri.getAuthority, + uri.getPath, uri.getQuery, uri.getFragment).toString + return s3Rewritten + } val sch = uri.getScheme match { case "hdfs" => "hdfs" case "file" => "file" diff --git a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala index f88e085fa..161d285c7 100644 --- a/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala +++ b/arrow-data-source/standard/src/test/scala/com/intel/oap/spark/sql/execution/datasources/arrow/ArrowDataSourceTest.scala @@ -106,10 +106,18 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { verifyParquet( spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path)) } + test("simple sql query on s3") { + val path = "s3a://mlp-spark-dataset-bucket/test_arrowds_s3_small" + val frame = spark.read + .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") + .arrow(path) + frame.createOrReplaceTempView("stab") + assert(spark.sql("select id from stab").count() === 1000) + } + test("create catalog table") { val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) spark.catalog.createTable("ptab", path, "arrow") @@ 
-130,7 +138,6 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) val frame = spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path) frame.createOrReplaceTempView("ptab") verifyParquet(spark.sql("select * from ptab")) @@ -142,7 +149,6 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { val path = ArrowDataSourceTest.locateResourcePath(parquetFile3) val frame = spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path) frame.createOrReplaceTempView("ptab") val sqlFrame = spark.sql("select * from ptab") @@ -163,7 +169,6 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) val frame = spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path) frame.createOrReplaceTempView("ptab") spark.sql("select col from ptab where col = 1").explain(true) @@ -178,7 +183,6 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { val path = ArrowDataSourceTest.locateResourcePath(parquetFile2) val frame = spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path) frame.createOrReplaceTempView("ptab") val rows = spark.sql("select * from ptab where col = 'b'").collect() @@ -215,7 +219,6 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession { val path = ArrowDataSourceTest.locateResourcePath(parquetFile1) val frame = spark.read .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "parquet") - .option(ArrowOptions.KEY_FILESYSTEM, "hdfs") .arrow(path) frame.createOrReplaceTempView("ptab") diff --git a/docs/ApacheArrowInstallation.md b/docs/ApacheArrowInstallation.md index 4e0647f74..cff4740e1 100644 --- a/docs/ApacheArrowInstallation.md +++ b/docs/ApacheArrowInstallation.md @@ -24,25 +24,16 @@ make install ``` # cmake: -Arrow will download package during compiling, in order to support SSL in cmake, build cmake is optional. -``` shell -wget https://github.com/Kitware/CMake/releases/download/v3.15.0-rc4/cmake-3.15.0-rc4.tar.gz -tar xf cmake-3.15.0-rc4.tar.gz -cd cmake-3.15.0-rc4/ -./bootstrap --system-curl --parallel=64 #parallel num depends on your server core number -make -j -make install -cmake --version -cmake version 3.15.0-rc4 -``` +Please make sure your cmake version is qualified based on the prerequisite. + # Apache Arrow ``` shell git clone https://github.com/Intel-bigdata/arrow.git -cd arrow && git checkout branch-0.17.0-oap-1.0 +cd arrow && git checkout mkdir -p arrow/cpp/release-build cd arrow/cpp/release-build -cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. +cmake -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JNI=ON -DARROW_DATASET=ON -DARROW_WITH_PROTOBUF=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_LZ4=ON -DARROW_FILESYSTEM=ON -DARROW_JSON=ON .. 
make -j make install @@ -60,11 +51,4 @@ mvn test -pl adapter/parquet -P arrow-jni mvn test -pl gandiva -P arrow-jni ``` -# Copy binary files to oap-native-sql resources directory -Because oap-native-sql plugin will build a stand-alone jar file with arrow dependency, if you choose to build Arrow by yourself, you have to copy below files as a replacement from the original one. -You can find those files in Apache Arrow installation directory or release directory. Below example assume Apache Arrow has been installed on /usr/local/lib64 -``` shell -cp /usr/local/lib64/libarrow.so.17 $native-sql-engine-dir/cpp/src/resources -cp /usr/local/lib64/libgandiva.so.17 $native-sql-engine-dir/cpp/src/resources -cp /usr/local/lib64/libparquet.so.17 $native-sql-engine-dir/cpp/src/resources -``` +After arrow installed in the specific directory, please make sure to set up -Dbuild_arrow=OFF -Darrow_root=/path/to/arrow when building Native SQL Engine. diff --git a/docs/Configuration.md b/docs/Configuration.md index b20b46f0e..8b6615687 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -1,6 +1,45 @@ # Spark Configurations for Native SQL Engine -Add below configuration to spark-defaults.conf +There are many configuration could impact the Native SQL Engine performance and can be fine tune in Spark. +You can add these configuration into spark-defaults.conf to enable or disable the setting. + +| Parameters | Description | Recommend Setting | +| ---------- | ----------- | --------------- | +| spark.driver.extraClassPath | To add Arrow Data Source and Native SQL Engine jar file in Spark Driver | /path/to/jar_file1:/path/to/jar_file2 | +| spark.executor.extraClassPath | To add Arrow Data Source and Native SQL Engine jar file in Spark Executor | /path/to/jar_file1:/path/to/jar_file2 | +| spark.executorEnv.LIBARROW_DIR | To set up the location of Arrow library, by default it will search the loation of jar to be uncompressed | /path/to/arrow_library/ | +| spark.executorEnv.CC | To set up the location of gcc | /path/to/gcc/ | +| spark.executor.memory| To set up how much memory to be used for Spark Executor. | | +| spark.memory.offHeap.size| To set up how much memory to be used for Java OffHeap.
Please note that Native SQL Engine will leverage this setting to allocate memory for native usage even when off-heap memory is disabled.
The value depends on your system; it is recommended to set it larger if you are facing an Out of Memory issue in Native SQL Engine | 30G |
+| spark.executor.extraJavaOptions | To set up how much Direct Memory to be used for Native SQL Engine. The value depends on your system; it is recommended to set it larger if you are facing an Out of Memory issue in Native SQL Engine | -XX:MaxDirectMemorySize=30G |
+| spark.sql.sources.useV1SourceList | Data sources that should fall back to the Spark V1 source code path | avro |
+| spark.sql.join.preferSortMergeJoin | To turn off preferSortMergeJoin in Spark | false |
+| spark.sql.extensions | To turn on Native SQL Engine Plugin | com.intel.oap.ColumnarPlugin |
+| spark.shuffle.manager | To turn on Native SQL Engine Columnar Shuffle Plugin | org.apache.spark.shuffle.sort.ColumnarShuffleManager |
+| spark.oap.sql.columnar.batchscan | Enable or Disable Columnar BatchScan, default is true | true |
+| spark.oap.sql.columnar.hashagg | Enable or Disable Columnar Hash Aggregate, default is true | true |
+| spark.oap.sql.columnar.projfilter | Enable or Disable Columnar Project and Filter, default is true | true |
+| spark.oap.sql.columnar.codegen.sort | Enable or Disable Columnar Sort, default is true | true |
+| spark.oap.sql.columnar.window | Enable or Disable Columnar Window, default is true | true |
+| spark.oap.sql.columnar.shuffledhashjoin | Enable or Disable Columnar ShuffledHashJoin, default is true | true |
+| spark.oap.sql.columnar.sortmergejoin | Enable or Disable Columnar Sort Merge Join, default is true | true |
+| spark.oap.sql.columnar.union | Enable or Disable Columnar Union, default is true | true |
+| spark.oap.sql.columnar.expand | Enable or Disable Columnar Expand, default is true | true |
+| spark.oap.sql.columnar.broadcastexchange | Enable or Disable Columnar Broadcast Exchange, default is true | true |
+| spark.oap.sql.columnar.nanCheck | Enable or Disable NaN Check, default is true | true |
+| spark.oap.sql.columnar.hashCompare | Enable or Disable Hash Compare in HashJoins or HashAgg, default is true | true |
+| spark.oap.sql.columnar.broadcastJoin | Enable or Disable Columnar BroadcastHashJoin, default is true | true |
+| spark.oap.sql.columnar.wholestagecodegen | Enable or Disable Columnar WholeStageCodeGen, default is true | true |
+| spark.oap.sql.columnar.preferColumnar | Enable or Disable Columnar Operators, default is false.
This parameter could impact performance in different cases; in some cases, setting it to false can give a performance boost. | false |
+| spark.oap.sql.columnar.joinOptimizationLevel | Fall back to row operators if there are several continuous joins | 6 |
+| spark.sql.execution.arrow.maxRecordsPerBatch | Set up the max records per batch | 10000 |
+| spark.oap.sql.columnar.wholestagecodegen.breakdownTime | Enable or Disable metrics in Columnar WholeStageCodeGen | false |
+| spark.oap.sql.columnar.tmp_dir | Set up a folder to store the codegen files | /tmp |
+| spark.oap.sql.columnar.shuffle.customizedCompression.codec | Set up the codec to be used for Columnar Shuffle, default is lz4 | lz4 |
+| spark.oap.sql.columnar.numaBinding | Set up NUMABinding, default is false | true |
+| spark.oap.sql.columnar.coreRange | Set up the core range for NUMABinding, only works when numaBinding is set to true.
The setting is based on the number of cores in your system. Use 72 cores as an example. | 0-17,36-53 |18-35,54-71 | + +Below is an example for spark-default.conf, if you are using conda to install OAP project. ``` ##### Columnar Process Configuration @@ -26,4 +65,3 @@ export CC=$HOME/miniconda2/envs/oapenv/bin/gcc export LIBARROW_DIR=$HOME/miniconda2/envs/oapenv/ ``` -About arrow-data-source.jar, you can refer [Unified Arrow Data Source ](https://oap-project.github.io/arrow-data-source/). diff --git a/docs/Installation.md b/docs/Installation.md index 604829663..bb5a1ec11 100644 --- a/docs/Installation.md +++ b/docs/Installation.md @@ -14,17 +14,20 @@ yum install gmock ``` shell git clone -b ${version} https://github.com/oap-project/native-sql-engine.git cd oap-native-sql -cd cpp/ -mkdir build/ -cd build/ -cmake .. -DTESTS=ON -make -j +mvn clean package -DskipTests -Dcpp_tests=OFF -Dbuild_arrow=ON -Dcheckstyle.skip ``` -``` shell -cd ../../core/ -mvn clean package -DskipTests -``` +Based on the different environment, there are some parameters can be set via -D with mvn. + +| Parameters | Description | Default Value | +| ---------- | ----------- | ------------- | +| cpp_tests | Enable or Disable CPP Tests | False | +| build_arrow | Build Arrow from Source | True | +| arrow_root | When build_arrow set to False, arrow_root will be enabled to find the location of your existing arrow library. | /usr/local | +| build_protobuf | Build Protobuf from Source. If set to False, default library path will be used to find protobuf library. | True | + +When build_arrow set to True, the build_arrow.sh will be launched and compile a custom arrow library from [OAP Arrow](https://github.com/oap-project/arrow) +If you wish to change any parameters from Arrow, you can change it from the build_arrow.sh script under native-sql-enge/arrow-data-source/script/. ### Additonal Notes [Notes for Installation Issues](./InstallationNotes.md) diff --git a/docs/Prerequisite.md b/docs/Prerequisite.md index 5ff82aa1b..e678a6340 100644 --- a/docs/Prerequisite.md +++ b/docs/Prerequisite.md @@ -4,12 +4,13 @@ There are some requirements before you build the project. Please make sure you have already installed the software in your system. 1. gcc 9.3 or higher version -2. java8 OpenJDK -> yum install java-1.8.0-openjdk -3. cmake 3.2 or higher version -4. maven 3.1.1 or higher version -5. Hadoop 2.7.5 or higher version -6. Spark 3.0.0 or higher version -7. Intel Optimized Arrow 0.17.0 +2. LLVM 7.0 or higher version +3. java8 OpenJDK -> yum install java-1.8.0-openjdk +4. cmake 3.16 or higher version +5. maven 3.1.1 or higher version +6. Hadoop 2.7.5 or higher version +7. Spark 3.0.0 or higher version +8. Intel Optimized Arrow 0.17.0 ## gcc installation @@ -44,6 +45,33 @@ Please remember to add and source the setup in your environment files such as /e //Verify if gcc has been installation Use gcc -v command to verify if your gcc version is correct.(Must larger than 9.3) +## LLVM 7.0 installation + +Arrow Gandiva depends on LLVM, and I noticed current version strictly depends on llvm7.0 if you installed any other version rather than 7.0, it will fail. +``` shell +wget http://releases.llvm.org/7.0.1/llvm-7.0.1.src.tar.xz +tar xf llvm-7.0.1.src.tar.xz +cd llvm-7.0.1.src/ +cd tools +wget http://releases.llvm.org/7.0.1/cfe-7.0.1.src.tar.xz +tar xf cfe-7.0.1.src.tar.xz +mv cfe-7.0.1.src clang +cd .. +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release +cmake --build . -j +cmake --build . 
--target install +# check if clang has also been compiled, if no +cd tools/clang +mkdir build +cd build +cmake .. +make -j +make install +``` + + ## cmake installation If you are facing some trouble when installing cmake, please follow below steps to install cmake. @@ -146,6 +174,6 @@ Please notes: If you choose to use libhdfs3.so, there are some other dependency ## Intel Optimized Apache Arrow Installation -Intel Optimized Apache Arrow is MANDATORY to be used. However, we have a bundle a compiled arrow libraries(libarrow, libgandiva, libparquet) built by GCC9.3 included in the cpp/src/resources directory. +During the mvn compile command, it will launch a script(build_arrow.sh) to help install and compile a Intel custom Arrow library. If you wish to build Apache Arrow by yourself, please follow the guide to build and install Apache Arrow [ArrowInstallation](./ApacheArrowInstallation.md) diff --git a/native-sql-engine/core/pom.xml b/native-sql-engine/core/pom.xml index 5064374d3..92d823eaa 100644 --- a/native-sql-engine/core/pom.xml +++ b/native-sql-engine/core/pom.xml @@ -30,10 +30,6 @@ ../cpp/ ../cpp/build/releases/ - 3.0.0 - 3.0.0 - 2.12 - 2.12.8 none package provided diff --git a/native-sql-engine/cpp/src/CMakeLists.txt b/native-sql-engine/cpp/src/CMakeLists.txt index bf4bd1329..24282816d 100644 --- a/native-sql-engine/cpp/src/CMakeLists.txt +++ b/native-sql-engine/cpp/src/CMakeLists.txt @@ -141,6 +141,7 @@ macro(build_arrow STATIC_ARROW) -DARROW_GANDIVA_JAVA=ON -DARROW_GANDIVA=ON -DARROW_PARQUET=ON + -DARROW_CSV=ON -DARROW_HDFS=ON -DARROW_BOOST_USE_SHARED=OFF -DARROW_JNI=ON diff --git a/native-sql-engine/cpp/src/codegen/arrow_compute/ext/hash_aggregate_kernel.cc b/native-sql-engine/cpp/src/codegen/arrow_compute/ext/hash_aggregate_kernel.cc index 8486dfcc9..9b565a7d2 100644 --- a/native-sql-engine/cpp/src/codegen/arrow_compute/ext/hash_aggregate_kernel.cc +++ b/native-sql-engine/cpp/src/codegen/arrow_compute/ext/hash_aggregate_kernel.cc @@ -804,7 +804,9 @@ class HashAggregateKernel::Impl { post_process_projector_(post_process_projector), action_impl_list_(action_impl_list) { aggr_hash_table_ = std::make_shared(ctx->memory_pool()); +#ifdef DEBUG std::cout << "using string hashagg res" << std::endl; +#endif batch_size_ = GetBatchSize(); if (key_index_list.size() > 1) { aggr_key_unsafe_row = std::make_shared(key_index_list.size()); @@ -859,9 +861,6 @@ class HashAggregateKernel::Impl { typed_key_in->null_count() == 0 ? true : !typed_key_in->IsNull(i); } - // for (int n = 0; n < aggr_key.size(); ++n) printf("%0X ", - // *(aggr_key.data() + n)); std::cout << std::endl; - // 3. get key from hash_table int memo_index = 0; if (!aggr_key_validity) { diff --git a/pom.xml b/pom.xml index da47c3ff3..b1ef64276 100644 --- a/pom.xml +++ b/pom.xml @@ -30,6 +30,13 @@ + 2.12.10 + 2.12 + 3.0.0 + 3.0.0 + 2.7.4 + UTF-8 + UTF-8 ${project.basedir}/script OFF ON @@ -44,4 +51,12 @@ native-sql-engine/core + + + hadoop-3.2 + + 3.2.0 + + +
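
For anyone who wants to try the S3 path added by this patch end to end, below is a minimal spark-shell sketch modeled on the new `simple sql query on s3` test case above. It assumes the fat jars built from this patch are already on the driver and executor classpaths and that `hadoop-aws` is resolvable; the bucket, path, endpoint, credentials, and the literal `"originalFormat"` option key (assumed to correspond to `ArrowOptions.KEY_ORIGINAL_FORMAT`) are illustrative placeholders, not values taken from the patch.

```scala
// Hypothetical spark-shell session exercising the S3/S3A support added in this patch.
// Credentials, endpoint, bucket and path below are placeholders; the "originalFormat"
// option key is assumed to match ArrowOptions.KEY_ORIGINAL_FORMAT used in the tests.
sc.hadoopConfiguration.set("fs.s3a.access.key", "<access-key>")
sc.hadoopConfiguration.set("fs.s3a.secret.key", "<secret-key>")
sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3.<region>.amazonaws.com")

val df = spark.read
  .option("originalFormat", "parquet")            // underlying file format, as in the new S3 test
  .format("arrow")                                 // short name registered by spark-arrow-datasource-standard
  .load("s3a://<your-bucket>/<path-to-parquet>")

df.createOrReplaceTempView("stab")
spark.sql("select count(*) from stab").show()
```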