[SPARK-40819][SQL] Timestamp nanos behaviour regression #38312
Changes from all commits
**`ParquetSchemaConverter.scala`**

```diff
@@ -49,24 +49,28 @@ import org.apache.spark.sql.types._
  * @param caseSensitive Whether use case sensitive analysis when comparing Spark catalyst read
  *        schema with Parquet schema.
  * @param inferTimestampNTZ Whether TimestampNTZType type is enabled.
+ * @param nanosAsLong Whether timestamps with nanos are converted to long.
  */
 class ParquetToSparkSchemaConverter(
     assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get,
     assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get,
     caseSensitive: Boolean = SQLConf.CASE_SENSITIVE.defaultValue.get,
-    inferTimestampNTZ: Boolean = SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.defaultValue.get) {
+    inferTimestampNTZ: Boolean = SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.defaultValue.get,
+    nanosAsLong: Boolean = SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.defaultValue.get) {

   def this(conf: SQLConf) = this(
     assumeBinaryIsString = conf.isParquetBinaryAsString,
     assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp,
     caseSensitive = conf.caseSensitiveAnalysis,
-    inferTimestampNTZ = conf.parquetInferTimestampNTZEnabled)
+    inferTimestampNTZ = conf.parquetInferTimestampNTZEnabled,
+    nanosAsLong = conf.legacyParquetNanosAsLong)

   def this(conf: Configuration) = this(
     assumeBinaryIsString = conf.get(SQLConf.PARQUET_BINARY_AS_STRING.key).toBoolean,
     assumeInt96IsTimestamp = conf.get(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key).toBoolean,
     caseSensitive = conf.get(SQLConf.CASE_SENSITIVE.key).toBoolean,
-    inferTimestampNTZ = conf.get(SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key).toBoolean)
+    inferTimestampNTZ = conf.get(SQLConf.PARQUET_INFER_TIMESTAMP_NTZ_ENABLED.key).toBoolean,
+    nanosAsLong = conf.get(SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key).toBoolean)

   /**
    * Returns true if TIMESTAMP_NTZ type is enabled in this ParquetToSparkSchemaConverter.
```
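As a usage note, the new parameter is backed by `SQLConf.LEGACY_PARQUET_NANOS_AS_LONG`. A minimal sketch of enabling it at the session level, assuming the config key `spark.sql.legacy.parquet.nanosAsLong` and a hypothetical local fixture file:

```scala
import org.apache.spark.sql.SparkSession

// Build a session with the legacy flag enabled so that INT64
// TIMESTAMP(NANOS) Parquet columns are read back as plain LongType
// instead of failing with an illegal-type error.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("nanos-as-long")
  .config("spark.sql.legacy.parquet.nanosAsLong", "true")
  .getOrCreate()

// The nanos column now arrives as a Long holding nanoseconds since the epoch.
spark.read.parquet("/tmp/timestamp-nanos.parquet").printSchema()
```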
|
|
```diff
@@ -271,6 +275,11 @@ class ParquetToSparkSchemaConverter(
         } else {
           TimestampNTZType
         }
+      // SPARK-40819: NANOS are not supported as a Timestamp, convert to LongType without
+      // timezone awareness to address behaviour regression introduced by SPARK-34661
```
**Contributor:** Can we do a truncation and still read it as a timestamp type?

**Author:** I do not think this is a good idea, as the precision will be lost, which is extremely important for high-frequency time series. I haven't verified, but end users/developers would still be able to […]

**Contributor:** Yes, the mere purpose of this exercise is to get access to the nano precision.
```diff
+      case timestamp: TimestampLogicalTypeAnnotation
+          if timestamp.getUnit == TimeUnit.NANOS && nanosAsLong =>
+        LongType
       case _ => illegalType()
     }
```
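Picking up the truncation discussion above: with `nanosAsLong` enabled, a caller who can live with microsecond precision can still derive a timestamp from the raw long themselves. A sketch, assuming a Long column `birthday` holding epoch nanoseconds (as in the tests later in this diff) and the built-in `timestamp_micros` SQL function:

```scala
import org.apache.spark.sql.functions.expr

// "birthday" holds nanoseconds since the epoch as a Long. Integer-divide by
// 1000 to get microseconds, then convert with TIMESTAMP_MICROS. The last
// three digits of nano precision are deliberately truncated here, which is
// exactly the loss the PR avoids forcing on everyone by default.
val withTs = spark.read.parquet("/tmp/timestamp-nanos.parquet")
  .withColumn("birthday_ts", expr("timestamp_micros(birthday DIV 1000)"))
```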
**`ParquetSchemaSuite.scala`**

```diff
@@ -29,6 +29,7 @@ import org.apache.parquet.schema.Type._
 import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
+import org.apache.spark.sql.functions.desc
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType._
 import org.apache.spark.sql.test.SharedSparkSession
```
|
|
```diff
@@ -45,15 +46,17 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
       binaryAsString: Boolean,
       int96AsTimestamp: Boolean,
       writeLegacyParquetFormat: Boolean,
-      expectedParquetColumn: Option[ParquetColumn] = None): Unit = {
+      expectedParquetColumn: Option[ParquetColumn] = None,
+      nanosAsLong: Boolean = false): Unit = {
     testSchema(
       testName,
       StructType.fromAttributes(ScalaReflection.attributesFor[T]),
       messageType,
       binaryAsString,
       int96AsTimestamp,
       writeLegacyParquetFormat,
-      expectedParquetColumn = expectedParquetColumn)
+      expectedParquetColumn = expectedParquetColumn,
+      nanosAsLong = nanosAsLong)
   }

   protected def testParquetToCatalyst(
```
```diff
@@ -65,12 +68,14 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
       caseSensitive: Boolean = false,
       inferTimestampNTZ: Boolean = true,
       sparkReadSchema: Option[StructType] = None,
-      expectedParquetColumn: Option[ParquetColumn] = None): Unit = {
+      expectedParquetColumn: Option[ParquetColumn] = None,
+      nanosAsLong: Boolean = false): Unit = {
     val converter = new ParquetToSparkSchemaConverter(
       assumeBinaryIsString = binaryAsString,
       assumeInt96IsTimestamp = int96AsTimestamp,
       caseSensitive = caseSensitive,
-      inferTimestampNTZ = inferTimestampNTZ)
+      inferTimestampNTZ = inferTimestampNTZ,
+      nanosAsLong = nanosAsLong)

     test(s"sql <= parquet: $testName") {
       val actualParquetColumn = converter.convertParquetColumn(
```
```diff
@@ -119,7 +124,8 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
       writeLegacyParquetFormat: Boolean,
       outputTimestampType: SQLConf.ParquetOutputTimestampType.Value =
         SQLConf.ParquetOutputTimestampType.INT96,
-      expectedParquetColumn: Option[ParquetColumn] = None): Unit = {
+      expectedParquetColumn: Option[ParquetColumn] = None,
+      nanosAsLong: Boolean = false): Unit = {

     testCatalystToParquet(
       testName,
```
```diff
@@ -134,7 +140,8 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
       parquetSchema,
       binaryAsString,
       int96AsTimestamp,
-      expectedParquetColumn = expectedParquetColumn)
+      expectedParquetColumn = expectedParquetColumn,
+      nanosAsLong = nanosAsLong)
   }

   protected def compareParquetColumn(actual: ParquetColumn, expected: ParquetColumn): Unit = {
```
```diff
@@ -149,7 +156,14 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
       val expectedDesc = expected.descriptor.get
       assert(actualDesc.getMaxRepetitionLevel == expectedDesc.getMaxRepetitionLevel)
       assert(actualDesc.getMaxRepetitionLevel == expectedDesc.getMaxRepetitionLevel)
-      assert(actualDesc.getPrimitiveType === expectedDesc.getPrimitiveType)
+
+      actualDesc.getPrimitiveType.getLogicalTypeAnnotation match {
+        case timestamp: LogicalTypeAnnotation.TimestampLogicalTypeAnnotation
+            if timestamp.getUnit == LogicalTypeAnnotation.TimeUnit.NANOS =>
+          assert(actual.sparkType == expected.sparkType)
+        case _ =>
+          assert(actualDesc.getPrimitiveType === expectedDesc.getPrimitiveType)
+      }
     }

     assert(actual.repetitionLevel == expected.repetitionLevel, "repetition level mismatch: " +
```
```diff
@@ -197,6 +211,32 @@ abstract class ParquetSchemaTest extends ParquetTest with SharedSparkSession {
 }

 class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
+  testSchemaInference[Tuple1[Long]](
```
**Contributor:** Could this case pass before Spark 3.2? My impression is that Parquet 1.10.1, as used by Spark 3.1, does not support the nanos type, does it?

**Author:** This particular case doesn't pass, and neither do similar tests for […]

**Contributor:** Then is this really a regression?

**Author:** So I've been looking further into it: it's because the message is different between […]. In […], whereas […]. So in Spark 3.1.0 you end up hitting this block, […], which returns a […], whereas since 3.2 you hit https://github.com/apache/spark/blob/branch-3.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala#L174, because a case for […] is missing.

**Contributor:** @cloud-fan moving Parquet from […]
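To make that explanation concrete, here is an illustrative sketch, not the actual Spark source (the string-typed signatures are simplifications), of how the two code paths treat an INT64 TIMESTAMP(NANOS) column:

```scala
// Spark 3.1 matched on parquet-mr 1.10's OriginalType, which has no NANOS
// member; a TIMESTAMP(NANOS) column therefore carried no original type and
// fell through to the plain INT64 handling, yielding LongType.
def convertSpark31Style(originalType: Option[String]): String = originalType match {
  case Some("TIMESTAMP_MILLIS") | Some("TIMESTAMP_MICROS") => "TimestampType"
  case None => "LongType" // TIMESTAMP(NANOS) landed here before Spark 3.2
  case _ => sys.error("Illegal Parquet type")
}

// Since Spark 3.2 (SPARK-34661), conversion matches on the logical type
// annotation's TimeUnit; with no NANOS case the same column became an
// illegal type, which is the regression the nanosAsLong flag addresses.
def convertSpark32Style(unit: String, nanosAsLong: Boolean): String = unit match {
  case "MILLIS" | "MICROS" => "TimestampType"
  case "NANOS" if nanosAsLong => "LongType" // the case added by this PR
  case _ => sys.error("Illegal Parquet type")
}
```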
```diff
+    "timestamp nanos",
+    """
+      |message root {
+      |  required int64 _1 (TIMESTAMP(NANOS,true));
+      |}
+    """.stripMargin,
+    binaryAsString = false,
+    int96AsTimestamp = true,
+    writeLegacyParquetFormat = true,
+    expectedParquetColumn = Some(
+      ParquetColumn(
+        sparkType = StructType.fromAttributes(
+          ScalaReflection.attributesFor[Tuple1[Long]]),
+        descriptor = None,
+        repetitionLevel = 0,
+        definitionLevel = 0,
+        required = false,
+        path = Seq(),
+        children = Seq(
+          primitiveParquetColumn(LongType, PrimitiveTypeName.INT64, Repetition.REQUIRED,
+            0, 0, Seq("_1"), logicalTypeAnnotation = Some(LogicalTypeAnnotation.intType(64, false)))
+        ))),
+    nanosAsLong = true
+  )
+
   testSchemaInference[(Boolean, Int, Long, Float, Double, Array[Byte])](
     "basic types",
     """
```
|
|
```diff
@@ -1027,6 +1067,24 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     }
   }

+  test("SPARK-40819: parquet file with TIMESTAMP(NANOS, true) (with nanosAsLong=true)") {
+    val tsAttribute = "birthday"
+    withSQLConf(SQLConf.LEGACY_PARQUET_NANOS_AS_LONG.key -> "true") {
+      val testDataPath = testFile("test-data/timestamp-nanos.parquet")
+      val data = spark.read.parquet(testDataPath).select(tsAttribute)
+      assert(data.schema.fields.head.dataType == LongType)
+      assert(data.orderBy(desc(tsAttribute)).take(1).head.getAs[Long](0) == 1668537129123534758L)
+    }
+  }
+
+  test("SPARK-40819: parquet file with TIMESTAMP(NANOS, true) (with default nanosAsLong=false)") {
+    val testDataPath = testFile("test-data/timestamp-nanos.parquet")
+    val e = intercept[org.apache.spark.SparkException] {
+      spark.read.parquet(testDataPath).collect()
+    }
+    assert(e.getMessage.contains("Illegal Parquet type: INT64 (TIMESTAMP(NANOS,true))."))
+  }
+
   // =======================================================
   // Tests for converting Parquet LIST to Catalyst ArrayType
   // =======================================================
```
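The tests above rely on a checked-in fixture, `test-data/timestamp-nanos.parquet`. Its generation isn't shown in this diff, but a sketch of producing such a file with parquet-mr's example writer API (the output path and single value mirror the test's expectations; treat the exact builder calls as assumptions) could look like:

```scala
import org.apache.hadoop.fs.Path
import org.apache.parquet.example.data.simple.SimpleGroup
import org.apache.parquet.hadoop.example.ExampleParquetWriter
import org.apache.parquet.schema.MessageTypeParser

// A schema with a single nanosecond-timestamp column named "birthday",
// matching what the tests expect to find in the fixture file.
val schema = MessageTypeParser.parseMessageType(
  "message root { required int64 birthday (TIMESTAMP(NANOS,true)); }")

val writer = ExampleParquetWriter
  .builder(new Path("/tmp/timestamp-nanos.parquet"))
  .withType(schema)
  .build()

val row = new SimpleGroup(schema)
row.add("birthday", 1668537129123534758L) // nanoseconds since the epoch
writer.write(row)
writer.close()
```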
**Contributor:** @awdavidson I realised that we already released Spark 3.2.3. Can you make a PR to land this fix in 3.2.4?

**Author:** Sure