-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-3407][SQL]Add Date type support #2344
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2dfbb5b
17fda35
0e3110e
4ddcb92
0e0a4f5
f8f219f
5429212
c37e848
617d1a8
30bf48b
aa96735
bb1b1ef
0df6ea1
00fe81f
2038085
f15074a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,18 +17,21 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import java.sql.Timestamp | ||
| import java.sql.{Date, Timestamp} | ||
| import java.text.{DateFormat, SimpleDateFormat} | ||
|
|
||
| import org.apache.spark.Logging | ||
| import org.apache.spark.sql.catalyst.errors.TreeNodeException | ||
| import org.apache.spark.sql.catalyst.types._ | ||
|
|
||
| /** Cast the child expression to the target data type. */ | ||
| case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | ||
| case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with Logging { | ||
| override def foldable = child.foldable | ||
|
|
||
| override def nullable = (child.dataType, dataType) match { | ||
| case (StringType, _: NumericType) => true | ||
| case (StringType, TimestampType) => true | ||
| case (StringType, DateType) => true | ||
| case _ => child.nullable | ||
| } | ||
|
|
||
|
|
@@ -42,6 +45,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| // UDFToString | ||
| private[this] def castToString: Any => Any = child.dataType match { | ||
| case BinaryType => buildCast[Array[Byte]](_, new String(_, "UTF-8")) | ||
| case DateType => buildCast[Date](_, dateToString) | ||
| case TimestampType => buildCast[Timestamp](_, timestampToString) | ||
| case _ => buildCast[Any](_, _.toString) | ||
| } | ||
|
|
@@ -56,7 +60,10 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| case StringType => | ||
| buildCast[String](_, _.length() != 0) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, b => b.getTime() != 0 || b.getNanos() != 0) | ||
| buildCast[Timestamp](_, t => t.getTime() != 0 || t.getNanos() != 0) | ||
| case DateType => | ||
| // Hive would return null when cast from date to boolean | ||
| buildCast[Date](_, d => null) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Leaving a code comment here would be good: it is really unintuitive to see that a timestamp can be cast to a boolean while a date has to be null. |
||
| case LongType => | ||
| buildCast[Long](_, _ != 0) | ||
| case IntegerType => | ||
|
|
@@ -95,6 +102,8 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| buildCast[Short](_, s => new Timestamp(s)) | ||
| case ByteType => | ||
| buildCast[Byte](_, b => new Timestamp(b)) | ||
| case DateType => | ||
| buildCast[Date](_, d => new Timestamp(d.getTime)) | ||
| // TimestampWritable.decimalToTimestamp | ||
| case DecimalType => | ||
| buildCast[BigDecimal](_, d => decimalToTimestamp(d)) | ||
|
|
@@ -130,7 +139,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| // Converts Timestamp to string according to Hive TimestampWritable convention | ||
| private[this] def timestampToString(ts: Timestamp): String = { | ||
| val timestampString = ts.toString | ||
| val formatted = Cast.threadLocalDateFormat.get.format(ts) | ||
| val formatted = Cast.threadLocalTimestampFormat.get.format(ts) | ||
|
|
||
| if (timestampString.length > 19 && timestampString.substring(19) != ".0") { | ||
| formatted + timestampString.substring(19) | ||
|
|
@@ -139,13 +148,48 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| } | ||
| } | ||
|
|
||
| // Formats a Timestamp as a date-only string (yyyy-MM-dd) using the per-thread Cast.threadLocalDateFormat | |
| private[this] def timestampToDateString(ts: Timestamp): String = { | |
| Cast.threadLocalDateFormat.get.format(ts) | |
| } | |
|
|
||
| // DateConverter | ||
| private[this] def castToDate: Any => Any = child.dataType match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We probably need to compare the date conversion with Hive, https://github.com/apache/hive/blob/branch-0.12/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/primitive/PrimitiveObjectInspectorUtils.java#L1032 Particularly for |
||
| case StringType => | ||
| buildCast[String](_, s => | ||
| try Date.valueOf(s) catch { case _: java.lang.IllegalArgumentException => null } | ||
| ) | ||
| case TimestampType => | ||
| // Discard any precision finer than one second, as Hive does. | |
| // Timestamp.nanos ranges from 0 to 999,999,999, i.e. it is always less than one second. | |
| buildCast[Timestamp](_, t => new Date(Math.floor(t.getTime / 1000.0).toLong * 1000)) | ||
| // Hive rejects any other source type with a SemanticException. | |
| // Results can never be compared with Hive when Hive fails with an exception, so we are free to return null. | |
| // NULL is the more reasonable result here, since the query itself is grammatically valid. | |
| case _ => _ => null | ||
| } | ||
|
|
||
| // Always returns null: Hive yields NULL when casting DATE to an integral type (shared by the Long/Int/Short/Byte casts) | |
| private[this] def dateToLong(d: Date) = null | |
|
|
||
| // Always returns null: Hive yields NULL when casting DATE to a fractional type (shared by the Double/Float/Decimal casts) | |
| private[this] def dateToDouble(d: Date) = null | |
|
|
||
| // Formats a Date as yyyy-MM-dd (Hive DateWritable convention) using the per-thread Cast.threadLocalDateFormat | |
| private[this] def dateToString(d: Date): String = { | |
| Cast.threadLocalDateFormat.get.format(d) | |
| } | |
|
|
||
| // LongConverter | ||
| private[this] def castToLong: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toLong catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1L else 0L) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToLong(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToLong(t)) | ||
| case DecimalType => | ||
|
|
@@ -154,13 +198,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| b => x.numeric.asInstanceOf[Numeric[Any]].toLong(b) | ||
| } | ||
|
|
||
| // IntConverter | ||
| private[this] def castToInt: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toInt catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1 else 0) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToLong(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToLong(t).toInt) | ||
| case DecimalType => | ||
|
|
@@ -169,13 +216,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b) | ||
| } | ||
|
|
||
| // ShortConverter | ||
| private[this] def castToShort: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toShort catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1.toShort else 0.toShort) | ||
| case DateType => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hive returns NULL when casting from DATE to INT, etc. |
||
| buildCast[Date](_, d => dateToLong(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToLong(t).toShort) | ||
| case DecimalType => | ||
|
|
@@ -184,13 +234,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b).toShort | ||
| } | ||
|
|
||
| // ByteConverter | ||
| private[this] def castToByte: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toByte catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1.toByte else 0.toByte) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToLong(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToLong(t).toByte) | ||
| case DecimalType => | ||
|
|
@@ -199,27 +252,33 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| b => x.numeric.asInstanceOf[Numeric[Any]].toInt(b).toByte | ||
| } | ||
|
|
||
| // DecimalConverter | ||
| private[this] def castToDecimal: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try BigDecimal(s.toDouble) catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) BigDecimal(1) else BigDecimal(0)) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToDouble(d)) | ||
| case TimestampType => | ||
| // Note that we lose precision here. | ||
| buildCast[Timestamp](_, t => BigDecimal(timestampToDouble(t))) | ||
| case x: NumericType => | ||
| b => BigDecimal(x.numeric.asInstanceOf[Numeric[Any]].toDouble(b)) | ||
| } | ||
|
|
||
| // DoubleConverter | ||
| private[this] def castToDouble: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toDouble catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1d else 0d) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToDouble(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToDouble(t)) | ||
| case DecimalType => | ||
|
|
@@ -228,13 +287,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| b => x.numeric.asInstanceOf[Numeric[Any]].toDouble(b) | ||
| } | ||
|
|
||
| // FloatConverter | ||
| private[this] def castToFloat: Any => Any = child.dataType match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toFloat catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
| buildCast[Boolean](_, b => if (b) 1f else 0f) | ||
| case DateType => | ||
| buildCast[Date](_, d => dateToDouble(d)) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => timestampToDouble(t).toFloat) | ||
| case DecimalType => | ||
|
|
@@ -245,17 +307,18 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
|
|
||
| private[this] lazy val cast: Any => Any = dataType match { | ||
| case dt if dt == child.dataType => identity[Any] | ||
| case StringType => castToString | ||
| case BinaryType => castToBinary | ||
| case DecimalType => castToDecimal | ||
| case StringType => castToString | ||
| case BinaryType => castToBinary | ||
| case DecimalType => castToDecimal | ||
| case DateType => castToDate | ||
| case TimestampType => castToTimestamp | ||
| case BooleanType => castToBoolean | ||
| case ByteType => castToByte | ||
| case ShortType => castToShort | ||
| case IntegerType => castToInt | ||
| case FloatType => castToFloat | ||
| case LongType => castToLong | ||
| case DoubleType => castToDouble | ||
| case BooleanType => castToBoolean | ||
| case ByteType => castToByte | ||
| case ShortType => castToShort | ||
| case IntegerType => castToInt | ||
| case FloatType => castToFloat | ||
| case LongType => castToLong | ||
| case DoubleType => castToDouble | ||
| } | ||
|
|
||
| override def eval(input: Row): Any = { | ||
|
|
@@ -267,6 +330,13 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression { | |
| object Cast { | ||
| // `SimpleDateFormat` is not thread-safe, so each thread lazily gets its own date-only (yyyy-MM-dd) formatter. | |
| private[sql] val threadLocalDateFormat = new ThreadLocal[DateFormat] { | |
| override def initialValue() = { | |
| new SimpleDateFormat("yyyy-MM-dd") | |
| } | |
| } | |
|
|
||
| // `SimpleDateFormat` is not thread-safe. | ||
| private[sql] val threadLocalTimestampFormat = new ThreadLocal[DateFormat] { | ||
| override def initialValue() = { | ||
| new SimpleDateFormat("yyyy-MM-dd HH:mm:ss") | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.types | ||
|
|
||
| import java.sql.Timestamp | ||
| import java.sql.{Date, Timestamp} | ||
|
|
||
| import scala.math.Numeric.{BigDecimalAsIfIntegral, DoubleAsIfIntegral, FloatAsIfIntegral} | ||
| import scala.reflect.ClassTag | ||
|
|
@@ -250,6 +250,16 @@ case object TimestampType extends NativeType { | |
| } | ||
| } | ||
|
|
||
| case object DateType extends NativeType { | ||
| private[sql] type JvmType = Date | ||
|
|
||
| @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized { typeTag[JvmType] } | ||
|
|
||
| private[sql] val ordering = new Ordering[JvmType] { | ||
| def compare(x: Date, y: Date) = x.compareTo(y) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've checked the logic of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we also need to modify
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is only used for ordering, not for data comparison. |
||
| } | ||
| } | ||
|
|
||
| abstract class NumericType extends NativeType with PrimitiveType { | ||
| // Unfortunately we can't get this implicitly as that breaks Spark Serialization. In order for | ||
| // implicitly[Numeric[JvmType]] to be valid, we have to change JvmType from a type variable to a | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about turning
Date/Timestamp comparison to Long comparison? String and long representations of Timestamp are both accurate to seconds. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems cast('1970-01-01' as date) < cast('1970-01-01 00:00:00' as timestamp)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK... verified this behavior with Hive, I've no idea about this :(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So Michael agreed to leave the whole ordering and comparing stuff in a separate PR :)