diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 969df4d92946b..0dfc4a32445db 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -153,9 +153,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index e827dc6036f85..77eaecc7e3fba 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -154,9 +154,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index 2b12c35d18e27..61744cba1a57c 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -172,9 +172,9 @@ okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
diff --git a/pom.xml b/pom.xml
index cc20c5cbf8887..956c35b13cb4f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,7 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.10.0</parquet.version>
-    <orc.version>1.5.2</orc.version>
+    <orc.version>1.5.3</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.24.v20180605</jetty.version>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index b6bb1d7ba4ce3..dc81c0585bf18 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.orc.OrcConf.COMPRESS
 import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2}
 import org.apache.orc.OrcProto.Stream.Kind
 import org.apache.orc.impl.RecordReaderImpl
 import org.scalatest.BeforeAndAfterAll
@@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
     }
   }
 
+  protected def testSelectiveDictionaryEncoding(isSelective: Boolean): Unit = {
+    val tableName = "orcTable"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.dictionary.key.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.dictionary.key.threshold '1.0',
+               |  hive.exec.orc.dictionary.key.size.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // Check the encoding kind of each column in the first stripe.
+          val stripe = recordReader.readStripeFooter(reader.getStripes.get(0))
+
+          // The encodings are divided into direct and dictionary-based categories, and
+          // further refined by whether they use RLE v1 or v2. RLE v1 is used by
+          // Hive 0.11, and RLE v2 was introduced in Hive 0.12 ORC with further
+          // improvements. For more details, see https://orc.apache.org/specification/
+          assert(stripe.getColumns(1).getKind === DICTIONARY_V2)
+          if (isSelective) {
+            assert(stripe.getColumns(2).getKind === DIRECT_V2)
+          } else {
+            assert(stripe.getColumns(2).getKind === DICTIONARY_V2)
+          }
+          // Floating-point types are stored with DIRECT encoding in IEEE 754 floating
+          // point bit layout.
+          assert(stripe.getColumns(3).getKind === DIRECT)
+        } finally {
+          if (recordReader != null) {
+            recordReader.close()
+          }
+        }
+      }
+    }
+  }
+
   test("create temporary orc table") {
     checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
 
@@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext {
   test("Check BloomFilter creation") {
     testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    testSelectiveDictionaryEncoding(isSelective = true)
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
index c1ae2f6861cb8..7fefaf53939bd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
@@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
       }
     }
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    Seq(true, false).foreach { convertMetastore =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") {
+        testSelectiveDictionaryEncoding(isSelective = false)
+      }
+    }
+  }
 }
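
A quick way to eyeball the resulting encodings outside the test harness is a small standalone driver built from the same reader calls the new test exercises (OrcFile.createReader, reader.rows, readStripeFooter). This is a minimal sketch, not part of the patch; the object name is illustrative and the file path comes from the command line.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile
import org.apache.orc.impl.RecordReaderImpl

// Hypothetical helper: prints the column encoding kinds recorded in the
// first stripe footer of an ORC file.
object PrintOrcEncodings {
  def main(args: Array[String]): Unit = {
    val reader = OrcFile.createReader(new Path(args(0)), OrcFile.readerOptions(new Configuration()))
    val recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
    try {
      // Inspect only the first stripe, as the test does; loop over
      // reader.getStripes to cover the whole file.
      val stripe = recordReader.readStripeFooter(reader.getStripes.get(0))
      for (i <- 0 until stripe.getColumnsCount) {
        // Column 0 is the struct root; the table's columns start at index 1.
        println(s"column $i -> ${stripe.getColumns(i).getKind}")
      }
    } finally {
      recordReader.close()
    }
  }
}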
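The same two knobs the test sets via CREATE TABLE should also be reachable from the DataFrame writer on the native path, on the assumption that orc.* data source options are copied into the Hadoop configuration used for the write. Here orc.dictionary.key.threshold '1.0' tells the writer never to abandon dictionary encoding for cardinality reasons, while orc.column.encoding.direct exempts the listed column. A sketch under that assumption; the column names and output path are illustrative:

// Assumes orc.* data source options reach the native ORC writer's Hadoop conf.
spark.range(10)
  .selectExpr("'94086' AS zipcode", "uuid() AS uniqColumn", "rand() AS value")
  .write
  .option("orc.dictionary.key.threshold", "1.0")      // keep dictionary encoding...
  .option("orc.column.encoding.direct", "uniqColumn") // ...except for this column
  .orc("/tmp/orc_direct_encoding_demo")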