-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-30301][SQL] Fix wrong results when datetimes as fields of complex types #26942
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
87fc4ab
7e1c437
5b8bec3
bb77e3c
1d4a9bf
e4bcc8c
32383e6
8310be0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,78 +56,41 @@ object HiveResult { | |
| // We need the types so we can output struct field names | ||
| val types = executedPlan.output.map(_.dataType) | ||
| // Reformat to match hive tab delimited output. | ||
| result.map(_.zip(types).map(toHiveString)).map(_.mkString("\t")) | ||
| result.map(_.zip(types).map(e => toHiveString(e))) | ||
| .map(_.mkString("\t")) | ||
| } | ||
|
|
||
| private val primitiveTypes = Seq( | ||
| StringType, | ||
| IntegerType, | ||
| LongType, | ||
| DoubleType, | ||
| FloatType, | ||
| BooleanType, | ||
| ByteType, | ||
| ShortType, | ||
| DateType, | ||
| TimestampType, | ||
| BinaryType) | ||
|
|
||
| private lazy val zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone) | ||
| private lazy val dateFormatter = DateFormatter(zoneId) | ||
| private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId) | ||
|
|
||
| /** Hive outputs fields of structs slightly differently than top level attributes. */ | ||
| private def toHiveStructString(a: (Any, DataType)): String = a match { | ||
| case (struct: Row, StructType(fields)) => | ||
| struct.toSeq.zip(fields).map { | ||
| case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}""" | ||
| }.mkString("{", ",", "}") | ||
| case (seq: Seq[_], ArrayType(typ, _)) => | ||
| seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]") | ||
| case (map: Map[_, _], MapType(kType, vType, _)) => | ||
| map.map { | ||
| case (key, value) => | ||
| toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType)) | ||
| }.toSeq.sorted.mkString("{", ",", "}") | ||
| case (null, _) => "null" | ||
| case (s: String, StringType) => "\"" + s + "\"" | ||
| case (decimal, DecimalType()) => decimal.toString | ||
| case (interval: CalendarInterval, CalendarIntervalType) => | ||
| SQLConf.get.intervalOutputStyle match { | ||
| case SQL_STANDARD => toSqlStandardString(interval) | ||
| case ISO_8601 => toIso8601String(interval) | ||
| case MULTI_UNITS => toMultiUnitsString(interval) | ||
| } | ||
| case (other, tpe) if primitiveTypes contains tpe => other.toString | ||
| } | ||
|
|
||
| /** Formats a datum (based on the given data type) and returns the string representation. */ | ||
| def toHiveString(a: (Any, DataType)): String = a match { | ||
| case (struct: Row, StructType(fields)) => | ||
| struct.toSeq.zip(fields).map { | ||
| case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}""" | ||
| }.mkString("{", ",", "}") | ||
| case (seq: Seq[_], ArrayType(typ, _)) => | ||
| seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]") | ||
| case (map: Map[_, _], MapType(kType, vType, _)) => | ||
| map.map { | ||
| case (key, value) => | ||
| toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType)) | ||
| }.toSeq.sorted.mkString("{", ",", "}") | ||
| case (null, _) => "NULL" | ||
| def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match { | ||
| case (null, _) => if (nested) "null" else "NULL" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this weird behavior inherited from Hive?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, there are many Hive compatibility unit tests there |
||
| case (b, BooleanType) => b.toString | ||
| case (d: Date, DateType) => dateFormatter.format(DateTimeUtils.fromJavaDate(d)) | ||
| case (t: Timestamp, TimestampType) => | ||
| DateTimeUtils.timestampToString(timestampFormatter, DateTimeUtils.fromJavaTimestamp(t)) | ||
| timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t)) | ||
| case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the other hand, if the nested-struct result is to be parsed back e.g. as json, the arbitrary bytes from the String created from binary may mess it up...
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a good point. Maybe we can follow The binary will be displayed like
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, but IIRC Hive results string from binary, and this is
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hive> create table bt(k binary);
OK
Time taken: 0.637 seconds
hive> insert into bt values ("spark"), ("hello"), ("3"), ("."), ("0");
Query ID = root_20200325055555_1c227d63-47dd-4899-a879-0b6a98269908
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1585115622286_0001, Tracking URL = http://quickstart.cloudera:8088/proxy/application_1585115622286_0001/
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_1585115622286_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2020-03-25 05:57:03,557 Stage-1 map = 0%, reduce = 0%
2020-03-25 05:57:09,766 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.86 sec
MapReduce Total cumulative CPU time: 1 seconds 860 msec
Ended Job = job_1585115622286_0001
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://quickstart.cloudera:8020/user/hive/warehouse/bt/.hive-staging_hive_2020-03-25_05-56-54_007_226069574359185383-1/-ext-10000
Loading data to table default.bt
Table default.bt stats: [numFiles=1, numRows=5, totalSize=33, rawDataSize=28]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 1.86 sec HDFS Read: 3080 HDFS Write: 99 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 860 msec
OK
Time taken: 17.185 seconds
hive> select k, array(k) from bt;
OK
spark [spark]
hello [hello]
3 [3]
. [.]
0 [0]
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Anything I am missing here?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah i see. Then seems nothing needs to be changed. cc @juliuszsompolski
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 0: jdbc:hive2://localhost:10000> desc bt_arr;
INFO : Compiling command(queryId=hive_20200325062323_10f71e38-fe8d-4094-aa87-5361df066edb): desc bt_arr
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:col_name, type:string, comment:from deserializer), FieldSchema(name:data_type, type:string, comment:from deserializer), FieldSchema(name:comment, type:string, comment:from deserializer)], properties:null)
INFO : Completed compiling command(queryId=hive_20200325062323_10f71e38-fe8d-4094-aa87-5361df066edb); Time taken: 0.056 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hive_20200325062323_10f71e38-fe8d-4094-aa87-5361df066edb): desc bt_arr
INFO : Starting task [Stage-0:DDL] in serial mode
INFO : Completed executing command(queryId=hive_20200325062323_10f71e38-fe8d-4094-aa87-5361df066edb); Time taken: 0.02 seconds
INFO : OK
+-----------+----------------+----------+--+
| col_name | data_type | comment |
+-----------+----------------+----------+--+
| _c0 | array<binary> | |
+-----------+----------------+----------+--+
1 row selected (0.158 seconds)
0: jdbc:hive2://localhost:10000> select * from bt_arr;
INFO : Compiling command(queryId=hive_20200325062323_edfa2e37-a8da-481b-97e9-f4ed9a37b9a4): select * from bt_arr
INFO : Semantic Analysis Completed
INFO : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:bt_arr._c0, type:array<binary>, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hive_20200325062323_edfa2e37-a8da-481b-97e9-f4ed9a37b9a4); Time taken: 0.053 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hive_20200325062323_edfa2e37-a8da-481b-97e9-f4ed9a37b9a4): select * from bt_arr
INFO : Completed executing command(queryId=hive_20200325062323_edfa2e37-a8da-481b-97e9-f4ed9a37b9a4); Time taken: 0.001 seconds
INFO : OK
+-------------+--+
| bt_arr._c0 |
+-------------+--+
| [spark] |
| [hello] |
| [3] |
| [.] |
| [0] |
+-------------+--+
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe the new behaviour is more sensible than the previous - returning just [B@ba4f370 was useless; returning the String made from Binary blob at least returns some content... I know some systems try to parse back these results as JSON to be able to explore the nested data - e.g. I think PowerBI does that with what thriftserver returns... I fear that if the binary has some strange contents, and is unquoted, it will break that JSON parsing... But I haven't tested it. |
||
| case (decimal: java.math.BigDecimal, DecimalType()) => decimal.toPlainString | ||
| case (n, _: NumericType) => n.toString | ||
| case (s: String, StringType) => if (nested) "\"" + s + "\"" else s | ||
| case (interval: CalendarInterval, CalendarIntervalType) => | ||
| SQLConf.get.intervalOutputStyle match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not related to this PR, but we should also simplify it. We can output SQL standard format if ansi mode is enabled, and output multi-unit format otherwise.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we are doing so, I suggest we support SQL standard input first |
||
| case SQL_STANDARD => toSqlStandardString(interval) | ||
| case ISO_8601 => toIso8601String(interval) | ||
| case MULTI_UNITS => toMultiUnitsString(interval) | ||
| } | ||
| case (interval, CalendarIntervalType) => interval.toString | ||
| case (other, _ : UserDefinedType[_]) => other.toString | ||
| case (other, tpe) if primitiveTypes.contains(tpe) => other.toString | ||
| case (seq: Seq[_], ArrayType(typ, _)) => | ||
| seq.map(v => (v, typ)).map(e => toHiveString(e, true)).mkString("[", ",", "]") | ||
| case (m: Map[_, _], MapType(kType, vType, _)) => | ||
| m.map { case (key, value) => | ||
| toHiveString((key, kType), true) + ":" + toHiveString((value, vType), true) | ||
| }.toSeq.sorted.mkString("{", ",", "}") | ||
| case (struct: Row, StructType(fields)) => | ||
| struct.toSeq.zip(fields).map { case (v, t) => | ||
| s""""${t.name}":${toHiveString((v, t.dataType), true)}""" | ||
| }.mkString("{", ",", "}") | ||
| case (other, _: UserDefinedType[_]) => other.toString | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@yaooqinn @cloud-fan
for ArrayType(ByteType), this was returning something like "[B@ba4f370" (by calling toString on the Array[Byte]), after this change it returns the String created via
new String(bin, StandardCharsets.UTF_8). Which of these is the intended output?
[B@ba4f370 is useless, while the string made from binary at least contains the contents of it, so it seems that after this change it at least makes more sense...