diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 969df4d92946b..0dfc4a32445db 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -153,9 +153,9 @@ objenesis-2.5.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index e827dc6036f85..77eaecc7e3fba 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -154,9 +154,9 @@ objenesis-2.5.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index 2b12c35d18e27..61744cba1a57c 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -172,9 +172,9 @@ okhttp-2.7.5.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
diff --git a/pom.xml b/pom.xml
index cc20c5cbf8887..956c35b13cb4f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,7 @@
    <hive.version.short>1.2.1</hive.version.short>
    <derby.version>10.12.1.1</derby.version>
    <parquet.version>1.10.0</parquet.version>
-    <orc.version>1.5.2</orc.version>
+    <orc.version>1.5.3</orc.version>
    <orc.classifier>nohive</orc.classifier>
    <hive.parquet.version>1.6.0</hive.parquet.version>
    <jetty.version>9.3.24.v20180605</jetty.version>
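
A quick runtime sanity check (not part of this patch; getImplementationVersion
reads the jar manifest and may return null when the classes are not loaded from
a packaged jar) to confirm which ORC core version is actually on the classpath:

    // Prints e.g. "1.5.3" when orc-core 1.5.3 is bundled.
    println(classOf[org.apache.orc.Reader].getPackage.getImplementationVersion)
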
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index b6bb1d7ba4ce3..dc81c0585bf18 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcConf.COMPRESS
import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2}
import org.apache.orc.OrcProto.Stream.Kind
import org.apache.orc.impl.RecordReaderImpl
import org.scalatest.BeforeAndAfterAll
@@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
    }
  }

+  protected def testSelectiveDictionaryEncoding(isSelective: Boolean) {
+    val tableName = "orcTable"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.dictionary.key.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.dictionary.key.threshold '1.0',
+               |  hive.exec.orc.dictionary.key.size.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // Check the kind
+          val stripe = recordReader.readStripeFooter(reader.getStripes.get(0))
+
+          // The encodings are divided into direct and dictionary-based categories and
+          // further refined by whether they use RLE v1 or v2. RLE v1 is used by
+          // Hive 0.11, and RLE v2 was introduced in Hive 0.12 ORC with more improvements.
+          // For more details, see https://orc.apache.org/specification/
+          assert(stripe.getColumns(1).getKind === DICTIONARY_V2)
+          if (isSelective) {
+            assert(stripe.getColumns(2).getKind === DIRECT_V2)
+          } else {
+            assert(stripe.getColumns(2).getKind === DICTIONARY_V2)
+          }
+          // Floating point types are stored with DIRECT encoding in IEEE 754 floating
+          // point bit layout.
+          assert(stripe.getColumns(3).getKind === DIRECT)
+        } finally {
+          if (recordReader != null) {
+            recordReader.close()
+          }
+        }
+      }
+    }
+  }
+
  test("create temporary orc table") {
    checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))

@@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext {
test("Check BloomFilter creation") {
testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
}
+
+ test("Enforce direct encoding column-wise selectively") {
+ testSelectiveDictionaryEncoding(isSelective = true)
+ }
}
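
The new test drives the writer through CREATE TABLE ... OPTIONS, but the same
ORC settings can also be passed as DataFrameWriter options on the native path.
A minimal sketch, assuming a SparkSession named spark; the output path and
column contents are illustrative, not part of this patch:

    // Force everything eligible into dictionary encoding, then opt a single
    // high-cardinality column out of it with orc.column.encoding.direct.
    spark.range(10)
      .selectExpr(
        "CAST(id AS STRING) AS zipcode",
        "uuid() AS uniqColumn",
        "CAST(id AS DOUBLE) AS value")
      .write
      .format("orc")
      .option("orc.dictionary.key.threshold", "1.0")
      .option("orc.column.encoding.direct", "uniqColumn")
      .save("/tmp/orc_direct_demo")
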
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
index c1ae2f6861cb8..7fefaf53939bd 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala
@@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
      }
    }
  }
+
+  test("Enforce direct encoding column-wise selectively") {
+    Seq(true, false).foreach { convertMetastore =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") {
+        testSelectiveDictionaryEncoding(isSelective = false)
+      }
+    }
+  }
}
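
To spot-check the result outside the test harness, the same reader APIs used in
testSelectiveDictionaryEncoding can be pointed at any ORC file. A hedged sketch;
the file path below is hypothetical:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.orc.OrcFile
    import org.apache.orc.impl.RecordReaderImpl

    // Print the encoding kind recorded for each column in the first stripe's
    // footer (e.g. DICTIONARY_V2, DIRECT_V2, DIRECT).
    val reader = OrcFile.createReader(
      new Path("/tmp/orc_direct_demo/part-00000.orc"),
      OrcFile.readerOptions(new Configuration()))
    val recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
    try {
      val footer = recordReader.readStripeFooter(reader.getStripes.get(0))
      for (i <- 0 until footer.getColumnsCount) {
        println(s"column $i: ${footer.getColumns(i).getKind}")
      }
    } finally {
      recordReader.close()
    }
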