@@ -41,34 +41,31 @@ import org.apache.spark.sql.{AnalysisException, SQLConf}
4141 * @constructor
4242 * @param assumeBinaryIsString Whether unannotated BINARY fields should be assumed to be Spark SQL
4343 * [[StringType]] fields when converting a Parquet [[MessageType]] to Spark SQL
44- * [[StructType ]].
44+ * [[StructType ]]. This argument only affects Parquet read path.
4545 * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should be assumed to be Spark SQL
4646 * [[TimestampType]] fields when converting a Parquet [[MessageType]] to Spark SQL
4747 * [[StructType ]]. Note that Spark SQL [[TimestampType ]] is similar to Hive timestamp, which
4848 * has optional nanosecond precision, but different from `TIME_MILLIS` and `TIMESTAMP_MILLIS`
49- * described in Parquet format spec.
50- * @param followParquetFormatSpec Whether to generate standard DECIMAL, LIST, and MAP structure when
51- * converting Spark SQL [[StructType ]] to Parquet [[MessageType ]]. For Spark 1.4.x and
52- * prior versions, Spark SQL only supports decimals with a max precision of 18 digits, and
53- * uses non-standard LIST and MAP structure. Note that the current Parquet format spec is
54- * backwards-compatible with these settings. If this argument is set to `false`, we fallback
55- * to old style non-standard behaviors.
49+ * described in Parquet format spec. This argument only affects Parquet read path.
50+ * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4
51+ * and prior versions when converting a Catalyst [[StructType ]] to a Parquet [[MessageType ]].
52+ * When set to false, use the standard format defined in the parquet-format spec. This argument only
53+ * affects Parquet write path.
5654 */
5755private [parquet] class CatalystSchemaConverter (
5856 assumeBinaryIsString : Boolean = SQLConf .PARQUET_BINARY_AS_STRING .defaultValue.get,
5957 assumeInt96IsTimestamp : Boolean = SQLConf .PARQUET_INT96_AS_TIMESTAMP .defaultValue.get,
60- followParquetFormatSpec : Boolean = SQLConf .PARQUET_FOLLOW_PARQUET_FORMAT_SPEC .defaultValue.get
61- ) {
58+ writeLegacyParquetFormat : Boolean = SQLConf .PARQUET_WRITE_LEGACY_FORMAT .defaultValue.get) {
6259
6360 def this (conf : SQLConf ) = this (
6461 assumeBinaryIsString = conf.isParquetBinaryAsString,
6562 assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp,
66- followParquetFormatSpec = conf.followParquetFormatSpec )
63+ writeLegacyParquetFormat = conf.writeLegacyParquetFormat )
6764
6865 def this (conf : Configuration ) = this (
6966 assumeBinaryIsString = conf.get(SQLConf .PARQUET_BINARY_AS_STRING .key).toBoolean,
7067 assumeInt96IsTimestamp = conf.get(SQLConf .PARQUET_INT96_AS_TIMESTAMP .key).toBoolean,
71- followParquetFormatSpec = conf.get(SQLConf .PARQUET_FOLLOW_PARQUET_FORMAT_SPEC .key).toBoolean)
68+ writeLegacyParquetFormat = conf.get(SQLConf .PARQUET_WRITE_LEGACY_FORMAT .key).toBoolean)
7269
7370 /**
7471 * Converts Parquet [[MessageType ]] `parquetSchema` to a Spark SQL [[StructType ]].
@@ -371,15 +368,15 @@ private[parquet] class CatalystSchemaConverter(
371368 case BinaryType =>
372369 Types .primitive(BINARY , repetition).named(field.name)
373370
374- // =====================================
375- // Decimals (for Spark version <= 1.4.x )
376- // =====================================
371+ // ======================
372+ // Decimals (legacy mode )
373+ // ======================
377374
378375 // Spark 1.4.x and prior versions only support decimals with a maximum precision of 18 and
379376 // always store decimals in fixed-length byte arrays. To keep compatibility with these older
380377 // versions, here we convert decimals with all precisions to `FIXED_LEN_BYTE_ARRAY` annotated
381378 // by `DECIMAL`.
382- case DecimalType .Fixed (precision, scale) if ! followParquetFormatSpec =>
379+ case DecimalType .Fixed (precision, scale) if writeLegacyParquetFormat =>
383380 Types
384381 .primitive(FIXED_LEN_BYTE_ARRAY , repetition)
385382 .as(DECIMAL )
@@ -388,13 +385,13 @@ private[parquet] class CatalystSchemaConverter(
388385 .length(CatalystSchemaConverter .minBytesForPrecision(precision))
389386 .named(field.name)
390387
391- // =====================================
392- // Decimals (follow Parquet format spec )
393- // =====================================
388+ // ========================
389+ // Decimals (standard mode )
390+ // ========================
394391
395392 // Uses INT32 for 1 <= precision <= 9
396393 case DecimalType .Fixed (precision, scale)
397- if precision <= MAX_PRECISION_FOR_INT32 && followParquetFormatSpec =>
394+ if precision <= MAX_PRECISION_FOR_INT32 && ! writeLegacyParquetFormat =>
398395 Types
399396 .primitive(INT32 , repetition)
400397 .as(DECIMAL )
@@ -404,7 +401,7 @@ private[parquet] class CatalystSchemaConverter(
404401
405402 // Uses INT64 for 1 <= precision <= 18
406403 case DecimalType .Fixed (precision, scale)
407- if precision <= MAX_PRECISION_FOR_INT64 && followParquetFormatSpec =>
404+ if precision <= MAX_PRECISION_FOR_INT64 && ! writeLegacyParquetFormat =>
408405 Types
409406 .primitive(INT64 , repetition)
410407 .as(DECIMAL )
@@ -413,7 +410,7 @@ private[parquet] class CatalystSchemaConverter(
413410 .named(field.name)
414411
415412 // Uses FIXED_LEN_BYTE_ARRAY for all other precisions
416- case DecimalType .Fixed (precision, scale) if followParquetFormatSpec =>
413+ case DecimalType .Fixed (precision, scale) if ! writeLegacyParquetFormat =>
417414 Types
418415 .primitive(FIXED_LEN_BYTE_ARRAY , repetition)
419416 .as(DECIMAL )
@@ -422,15 +419,15 @@ private[parquet] class CatalystSchemaConverter(
422419 .length(CatalystSchemaConverter .minBytesForPrecision(precision))
423420 .named(field.name)
424421
425- // ===================================================
426- // ArrayType and MapType (for Spark versions <= 1.4.x )
427- // ===================================================
422+ // ===================================
423+ // ArrayType and MapType (legacy mode )
424+ // ===================================
428425
429426 // Spark 1.4.x and prior versions convert `ArrayType` with nullable elements into a 3-level
430427 // `LIST` structure. This behavior is somewhat a hybrid of parquet-hive and parquet-avro
431428 // (1.6.0rc3): the 3-level structure is similar to parquet-hive while the 3rd level element
432429 // field name "array" is borrowed from parquet-avro.
433- case ArrayType (elementType, nullable @ true ) if ! followParquetFormatSpec =>
430+ case ArrayType (elementType, nullable @ true ) if writeLegacyParquetFormat =>
434431 // <list-repetition> group <name> (LIST) {
435432 // optional group bag {
436433 // repeated <element-type> array;
@@ -448,7 +445,7 @@ private[parquet] class CatalystSchemaConverter(
448445 // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level
449446 // LIST structure. This behavior mimics parquet-avro (1.6.0rc3). Note that this case is
450447 // covered by the backwards-compatibility rules implemented in `isElementType()`.
451- case ArrayType (elementType, nullable @ false ) if ! followParquetFormatSpec =>
448+ case ArrayType (elementType, nullable @ false ) if writeLegacyParquetFormat =>
452449 // <list-repetition> group <name> (LIST) {
453450 // repeated <element-type> element;
454451 // }
@@ -460,7 +457,7 @@ private[parquet] class CatalystSchemaConverter(
460457
461458 // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by
462459 // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`.
463- case MapType (keyType, valueType, valueContainsNull) if ! followParquetFormatSpec =>
460+ case MapType (keyType, valueType, valueContainsNull) if writeLegacyParquetFormat =>
464461 // <map-repetition> group <name> (MAP) {
465462 // repeated group map (MAP_KEY_VALUE) {
466463 // required <key-type> key;
@@ -473,11 +470,11 @@ private[parquet] class CatalystSchemaConverter(
473470 convertField(StructField (" key" , keyType, nullable = false )),
474471 convertField(StructField (" value" , valueType, valueContainsNull)))
475472
476- // ==================================================
477- // ArrayType and MapType (follow Parquet format spec )
478- // ==================================================
473+ // =====================================
474+ // ArrayType and MapType (standard mode )
475+ // =====================================
479476
480- case ArrayType (elementType, containsNull) if followParquetFormatSpec =>
477+ case ArrayType (elementType, containsNull) if ! writeLegacyParquetFormat =>
481478 // <list-repetition> group <name> (LIST) {
482479 // repeated group list {
483480 // <element-repetition> <element-type> element;
0 commit comments