
Commit 84f12da

dongjoon-hyun authored and Jackey Lee committed
[SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write
## What changes were proposed in this pull request? Before ORC 1.5.3, `orc.dictionary.key.threshold` and `hive.exec.orc.dictionary.key.size.threshold` are applied for all columns. This has been a big huddle to enable dictionary encoding. From ORC 1.5.3, `orc.column.encoding.direct` is added to enforce direct encoding selectively in a column-wise manner. This PR aims to add that feature by upgrading ORC from 1.5.2 to 1.5.3. The followings are the patches in ORC 1.5.3 and this feature is the only one related to Spark directly. ``` ORC-406: ORC: Char(n) and Varchar(n) writers truncate to n bytes & corrupts multi-byte data (gopalv) ORC-403: [C++] Add checks to avoid invalid offsets in InputStream ORC-405: Remove calcite as a dependency from the benchmarks. ORC-375: Fix libhdfs on gcc7 by adding #include <functional> two places. ORC-383: Parallel builds fails with ConcurrentModificationException ORC-382: Apache rat exclusions + add rat check to travis ORC-401: Fix incorrect quoting in specification. ORC-385: Change RecordReader to extend Closeable. ORC-384: [C++] fix memory leak when loading non-ORC files ORC-391: [c++] parseType does not accept underscore in the field name ORC-397: Allow selective disabling of dictionary encoding. Original patch was by Mithun Radhakrishnan. ORC-389: Add ability to not decode Acid metadata columns ``` ## How was this patch tested? Pass the Jenkins with newly added test cases. Closes apache#22622 from dongjoon-hyun/SPARK-25635. Authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: gatorsmile <gatorsmile@gmail.com>
1 parent dba1fa1 commit 84f12da
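
For illustration (not part of this commit), here is a minimal sketch of how the new option can be used from the native ORC writer via `DataFrameWriter` options, mirroring the `CREATE TABLE ... OPTIONS` clause in the new test. The session setup, output path, and column names are assumptions for the example:

```scala
import org.apache.spark.sql.SparkSession

// A minimal sketch, assuming Spark built against ORC >= 1.5.3.
val spark = SparkSession.builder().master("local[*]").appName("orc-direct").getOrCreate()

// Mirroring the new test: raise `orc.dictionary.key.threshold` to 1.0 so every
// eligible column is dictionary-encoded, then opt `uniqColumn` out of it with
// `orc.column.encoding.direct`. The path is a placeholder.
spark.range(100)
  .selectExpr("cast(id % 10 as string) as zipcode", "uuid() as uniqColumn")
  .write
  .option("orc.dictionary.key.threshold", "1.0")      // dictionary-encode all eligible columns
  .option("orc.column.encoding.direct", "uniqColumn") // except this one: force DIRECT encoding
  .orc("/tmp/orc_direct_example")
```

Assuming the writer options are propagated to the ORC configuration the same way the test's `OPTIONS` clause is, `zipcode` should come out as DICTIONARY_V2 and `uniqColumn` as DIRECT_V2, which can be checked with ORC's `orc-tools meta` command on the output file.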

File tree

6 files changed: 93 additions & 10 deletions


dev/deps/spark-deps-hadoop-2.6

Lines changed: 3 additions & 3 deletions
```diff
@@ -153,9 +153,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
```

dev/deps/spark-deps-hadoop-2.7

Lines changed: 3 additions & 3 deletions
```diff
@@ -154,9 +154,9 @@ objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
```

dev/deps/spark-deps-hadoop-3.1

Lines changed: 3 additions & 3 deletions
```diff
@@ -172,9 +172,9 @@ okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.5.2-nohive.jar
-orc-mapreduce-1.5.2-nohive.jar
-orc-shims-1.5.2.jar
+orc-core-1.5.3-nohive.jar
+orc-mapreduce-1.5.3-nohive.jar
+orc-shims-1.5.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
```

pom.xml

Lines changed: 1 addition & 1 deletion
```diff
@@ -131,7 +131,7 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.10.0</parquet.version>
-    <orc.version>1.5.2</orc.version>
+    <orc.version>1.5.3</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.24.v20180605</jetty.version>
```

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala

Lines changed: 75 additions & 0 deletions
```diff
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 import org.apache.orc.OrcConf.COMPRESS
 import org.apache.orc.OrcFile
+import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2}
 import org.apache.orc.OrcProto.Stream.Kind
 import org.apache.orc.impl.RecordReaderImpl
 import org.scalatest.BeforeAndAfterAll
@@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
     }
   }
 
+  protected def testSelectiveDictionaryEncoding(isSelective: Boolean) {
+    val tableName = "orcTable"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.dictionary.key.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.dictionary.key.threshold '1.0',
+               |  hive.exec.orc.dictionary.key.size.threshold '1.0',
+               |  orc.column.encoding.direct 'uniqColumn'
+               |)
+             """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // Check the kind
+          val stripe = recordReader.readStripeFooter(reader.getStripes.get(0))
+
+          // The encodings are divided into direct or dictionary-based categories and
+          // further refined as to whether they use RLE v1 or v2. RLE v1 is used by
+          // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more improvements.
+          // For more details, see https://orc.apache.org/specification/
+          assert(stripe.getColumns(1).getKind === DICTIONARY_V2)
+          if (isSelective) {
+            assert(stripe.getColumns(2).getKind === DIRECT_V2)
+          } else {
+            assert(stripe.getColumns(2).getKind === DICTIONARY_V2)
+          }
+          // Floating point types are stored with DIRECT encoding in IEEE 754 floating
+          // point bit layout.
+          assert(stripe.getColumns(3).getKind === DIRECT)
+        } finally {
+          if (recordReader != null) {
+            recordReader.close()
+          }
+        }
+      }
+    }
+  }
+
   test("create temporary orc table") {
     checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10))
 
@@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext {
   test("Check BloomFilter creation") {
     testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    testSelectiveDictionaryEncoding(isSelective = true)
+  }
 }
```

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala

Lines changed: 8 additions & 0 deletions
```diff
@@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
       }
     }
   }
+
+  test("Enforce direct encoding column-wise selectively") {
+    Seq(true, false).foreach { convertMetastore =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") {
+        testSelectiveDictionaryEncoding(isSelective = false)
+      }
+    }
+  }
 }
```
