-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25635][SQL][BUILD] Support selective direct encoding in native ORC write #22622
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration | |
| import org.apache.hadoop.fs.Path | ||
| import org.apache.orc.OrcConf.COMPRESS | ||
| import org.apache.orc.OrcFile | ||
| import org.apache.orc.OrcProto.ColumnEncoding.Kind.{DICTIONARY_V2, DIRECT, DIRECT_V2} | ||
| import org.apache.orc.OrcProto.Stream.Kind | ||
| import org.apache.orc.impl.RecordReaderImpl | ||
| import org.scalatest.BeforeAndAfterAll | ||
|
|
@@ -115,6 +116,76 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { | |
| } | ||
| } | ||
|
|
||
| protected def testSelectiveDictionaryEncoding(isSelective: Boolean) { | ||
| val tableName = "orcTable" | ||
|
|
||
| withTempDir { dir => | ||
| withTable(tableName) { | ||
| val sqlStatement = orcImp match { | ||
| case "native" => | ||
| s""" | ||
| |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) | ||
| |USING ORC | ||
| |OPTIONS ( | ||
| | path '${dir.toURI}', | ||
| | orc.dictionary.key.threshold '1.0', | ||
| | orc.column.encoding.direct 'uniqColumn' | ||
| |) | ||
| """.stripMargin | ||
| case "hive" => | ||
| s""" | ||
| |CREATE TABLE $tableName (zipcode STRING, uniqColumn STRING, value DOUBLE) | ||
| |STORED AS ORC | ||
| |LOCATION '${dir.toURI}' | ||
| |TBLPROPERTIES ( | ||
| | orc.dictionary.key.threshold '1.0', | ||
| | hive.exec.orc.dictionary.key.size.threshold '1.0', | ||
| | orc.column.encoding.direct 'uniqColumn' | ||
| |) | ||
| """.stripMargin | ||
| case impl => | ||
| throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl") | ||
| } | ||
|
|
||
| sql(sqlStatement) | ||
| sql(s"INSERT INTO $tableName VALUES ('94086', 'random-uuid-string', 0.0)") | ||
|
|
||
| val partFiles = dir.listFiles() | ||
| .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) | ||
| assert(partFiles.length === 1) | ||
|
|
||
| val orcFilePath = new Path(partFiles.head.getAbsolutePath) | ||
| val readerOptions = OrcFile.readerOptions(new Configuration()) | ||
| val reader = OrcFile.createReader(orcFilePath, readerOptions) | ||
| var recordReader: RecordReaderImpl = null | ||
| try { | ||
| recordReader = reader.rows.asInstanceOf[RecordReaderImpl] | ||
|
|
||
| // Check the kind | ||
| val stripe = recordReader.readStripeFooter(reader.getStripes.get(0)) | ||
|
|
||
| // The encodings are divided into direct or dictionary-based categories and | ||
| // further refined as to whether they use RLE v1 or v2. RLE v1 is used by | ||
| // Hive 0.11 and RLE v2 is introduced in Hive 0.12 ORC with more improvements. | ||
| // For more details, see https://orc.apache.org/specification/ | ||
| assert(stripe.getColumns(1).getKind === DICTIONARY_V2) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you write some comments to explain what
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! |
||
| if (isSelective) { | ||
| assert(stripe.getColumns(2).getKind === DIRECT_V2) | ||
| } else { | ||
| assert(stripe.getColumns(2).getKind === DICTIONARY_V2) | ||
| } | ||
| // Floating point types are stored with DIRECT encoding in IEEE 754 floating | ||
| // point bit layout. | ||
| assert(stripe.getColumns(3).getKind === DIRECT) | ||
| } finally { | ||
| if (recordReader != null) { | ||
| recordReader.close() | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("create temporary orc table") { | ||
| checkAnswer(sql("SELECT COUNT(*) FROM normal_orc_source"), Row(10)) | ||
|
|
||
|
|
@@ -284,4 +355,8 @@ class OrcSourceSuite extends OrcSuite with SharedSQLContext { | |
| test("Check BloomFilter creation") { | ||
| testBloomFilterCreation(Kind.BLOOM_FILTER_UTF8) // After ORC-101 | ||
| } | ||
|
|
||
| test("Enforce direct encoding column-wise selectively") { | ||
| testSelectiveDictionaryEncoding(isSelective = true) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -182,4 +182,12 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { | |
| } | ||
| } | ||
| } | ||
|
|
||
| test("Enforce direct encoding column-wise selectively") { | ||
| Seq(true, false).foreach { convertMetastore => | ||
| withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { | ||
| testSelectiveDictionaryEncoding(isSelective = false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So even with
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep. This is based on the current behavior which is a little related to your CTAS PR. Only read-path works as expected.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we change Spark behavior later, this test will be adapted according to it.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. I see. Thanks. |
||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This new feature needs a doc update. We need to let our end users know how to use it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ur, Apache ORC is an independent Apache project which has its own website and documents. We should respect that. If we introduce new ORC configuration one by one in Apache Spark website, it will eventually duplicate Apache ORC document in Apache Spark document.
We had better guide ORC fans to Apache ORC website. If something is missing there, they can file an ORC JIRA, not SPARK JIRA.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am fine either way. However, our current doc does not explain we are passing the data source specific options to the underlying data source:
https://spark.apache.org/docs/latest/sql-programming-guide.html#manually-specifying-options
Could you help improve it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also give an example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds like a different issue. This PR covers both
That sounds like a different issue. This PR covers both `TBLPROPERTIES` and `OPTIONS` syntaxes, which are designed for that configuration purpose historically. I mean this is not a data-source-specific PR. Also, the scope of this PR is only write-side configurations. In any way, +1 for adding some introduction section with both Parquet/ORC examples there. We had better give both read-side and write-side configuration examples, too. Could you file a JIRA issue for that?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe, dictionary encoding could be a good candidate;
`parquet.enable.dictionary` and `orc.dictionary.key.threshold`, et al.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://issues.apache.org/jira/browse/SPARK-25656 is created for that.