diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index f568a931ae1f..5b3cc0940d9c 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1723,7 +1723,7 @@ setMethod("radians", #' @details #' \code{to_date}: Converts the column into a DateType. You may optionally specify #' a format according to the rules in: -#' \url{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}. +#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}. #' If the string cannot be parsed according to the specified format (or default), #' the value of the column will be null. #' By default, it follows casting rules to a DateType if the format is omitted @@ -1819,7 +1819,7 @@ setMethod("to_csv", signature(x = "Column"), #' @details #' \code{to_timestamp}: Converts the column into a TimestampType. You may optionally specify #' a format according to the rules in: -#' \url{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}. +#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}. #' If the string cannot be parsed according to the specified format (or default), #' the value of the column will be null. #' By default, it follows casting rules to a TimestampType if the format is omitted @@ -2240,7 +2240,7 @@ setMethod("n", signature(x = "Column"), #' \code{date_format}: Converts a date/timestamp/string to a value of string in the format #' specified by the date format given by the second argument. A pattern could be for instance #' \code{dd.MM.yyyy} and could return a string like '18.03.1993'. All -#' pattern letters of \code{java.text.SimpleDateFormat} can be used. +#' pattern letters of \code{java.time.format.DateTimeFormatter} can be used. #' Note: Use when ever possible specialized functions like \code{year}. These benefit from a #' specialized implementation. #' @@ -2666,7 +2666,7 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) #' to a string representing the timestamp of that moment in the current system time zone in the JVM #' in the given format. -#' See \href{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}{ +#' See \href{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}{ #' Customizing Formats} for available options. #' #' @rdname column_datetime_functions diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md index 1bd3b5ad0e1a..c4d2157de8b6 100644 --- a/docs/sql-migration-guide-upgrade.md +++ b/docs/sql-migration-guide-upgrade.md @@ -39,6 +39,7 @@ displayTitle: Spark SQL Upgrading Guide - In Spark version 2.4 and earlier, JSON datasource and JSON functions like `from_json` convert a bad JSON record to a row with all `null`s in the PERMISSIVE mode when specified schema is `StructType`. Since Spark 3.0, the returned row can contain non-`null` fields if some of JSON column values were parsed and converted to desired types successfully. + - Since Spark 3.0, the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions use java.time API for parsing and formatting dates/timestamps from/to strings by using ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html) based on Proleptic Gregorian calendar. In Spark version 2.4 and earlier, java.text.SimpleDateFormat and java.util.GregorianCalendar (hybrid calendar that supports both the Julian and Gregorian calendar systems, see https://docs.oracle.com/javase/7/docs/api/java/util/GregorianCalendar.html) is used for the same purpuse. New implementation supports pattern formats as described here https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html and performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parse if pattern is `yyyy-MM-dd` because the parser does not consume whole input. Another example is the `31/01/2015 00:00` input cannot be parsed by the `dd/MM/yyyy hh:mm` pattern because `hh` supposes hours in the range `1-12`. To switch back to the implementation used in Spark 2.4 and earlier, set `spark.sql.legacy.timeParser.enabled` to `true`. ## Upgrading From Spark SQL 2.3 to 2.4 - In Spark version 2.3 and earlier, the second parameter to array_contains function is implicitly promoted to the element type of first array type parameter. This type promotion can be lossy and may cause `array_contains` function to return wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some change in behavior and are illustrated in the table below. diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d188de39e21c..d2a771e9bb8e 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -874,7 +874,7 @@ def date_format(date, format): format given by the second argument. A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All - pattern letters of the Java class `java.text.SimpleDateFormat` can be used. + pattern letters of the Java class `java.time.format.DateTimeFormatter` can be used. .. note:: Use when ever possible specialized functions like `year`. These benefit from a specialized implementation. @@ -1094,7 +1094,7 @@ def to_date(col, format=None): """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType` using the optionally specified format. Specify formats according to - `SimpleDateFormats `_. + `DateTimeFormatter `_. # noqa By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format is omitted (equivalent to ``col.cast("date")``). @@ -1119,7 +1119,7 @@ def to_timestamp(col, format=None): """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType` using the optionally specified format. Specify formats according to - `SimpleDateFormats `_. + `DateTimeFormatter `_. # noqa By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format is omitted (equivalent to ``col.cast("timestamp")``). diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 35ade136cc60..4dd41042856d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -27,8 +27,7 @@ import org.apache.spark.sql.types._ class CSVInferSchema(val options: CSVOptions) extends Serializable { - @transient - private lazy val timestampParser = TimestampFormatter( + private val timestampParser = TimestampFormatter( options.timestampFormat, options.timeZone, options.locale) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 45e17ae235a9..73af0a3c5c2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import java.sql.Timestamp -import java.text.DateFormat -import java.util.{Calendar, TimeZone} +import java.util.{Calendar, Locale, TimeZone} import scala.util.control.NonFatal @@ -28,7 +27,8 @@ import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} @@ -562,16 +562,17 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti copy(timeZoneId = Option(timeZoneId)) override protected def nullSafeEval(timestamp: Any, format: Any): Any = { - val df = DateTimeUtils.newDateFormat(format.toString, timeZone) - UTF8String.fromString(df.format(new java.util.Date(timestamp.asInstanceOf[Long] / 1000))) + val df = TimestampFormatter(format.toString, timeZone, Locale.US) + UTF8String.fromString(df.format(timestamp.asInstanceOf[Long])) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val tf = TimestampFormatter.getClass.getName.stripSuffix("$") val tz = ctx.addReferenceObj("timeZone", timeZone) + val locale = ctx.addReferenceObj("locale", Locale.US) defineCodeGen(ctx, ev, (timestamp, format) => { - s"""UTF8String.fromString($dtu.newDateFormat($format.toString(), $tz) - .format(new java.util.Date($timestamp / 1000)))""" + s"""UTF8String.fromString($tf.apply($format.toString(), $tz, $locale) + .format($timestamp))""" }) } @@ -612,9 +613,10 @@ case class ToUnixTimestamp( } /** - * Converts time string with given pattern. - * (see [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html]) - * to Unix time stamp (in seconds), returns null if fail. + * Converts time string with given pattern to Unix time stamp (in seconds), returns null if fail. + * See [http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html] + * if SQL config spark.sql.legacy.timeParser.enabled is set to true otherwise + * [https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html]. * Note that hive Language Manual says it returns 0 if fail, but in fact it returns null. * If the second parameter is missing, use "yyyy-MM-dd HH:mm:ss". * If no parameters provided, the first parameter will be current_timestamp. @@ -663,9 +665,9 @@ abstract class UnixTime override def nullable: Boolean = true private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] - private lazy val formatter: DateFormat = + private lazy val formatter: TimestampFormatter = try { - DateTimeUtils.newDateFormat(constFormat.toString, timeZone) + TimestampFormatter(constFormat.toString, timeZone, Locale.US) } catch { case NonFatal(_) => null } @@ -677,16 +679,16 @@ abstract class UnixTime } else { left.dataType match { case DateType => - DateTimeUtils.daysToMillis(t.asInstanceOf[Int], timeZone) / 1000L + DateTimeUtils.daysToMillis(t.asInstanceOf[Int], timeZone) / MILLIS_PER_SECOND case TimestampType => - t.asInstanceOf[Long] / 1000000L + t.asInstanceOf[Long] / MICROS_PER_SECOND case StringType if right.foldable => if (constFormat == null || formatter == null) { null } else { try { formatter.parse( - t.asInstanceOf[UTF8String].toString).getTime / 1000L + t.asInstanceOf[UTF8String].toString) / MICROS_PER_SECOND } catch { case NonFatal(_) => null } @@ -698,8 +700,8 @@ abstract class UnixTime } else { val formatString = f.asInstanceOf[UTF8String].toString try { - DateTimeUtils.newDateFormat(formatString, timeZone).parse( - t.asInstanceOf[UTF8String].toString).getTime / 1000L + TimestampFormatter(formatString, timeZone, Locale.US).parse( + t.asInstanceOf[UTF8String].toString) / MICROS_PER_SECOND } catch { case NonFatal(_) => null } @@ -712,7 +714,7 @@ abstract class UnixTime val javaType = CodeGenerator.javaType(dataType) left.dataType match { case StringType if right.foldable => - val df = classOf[DateFormat].getName + val df = classOf[TimestampFormatter].getName if (formatter == null) { ExprCode.forNullValue(dataType) } else { @@ -724,24 +726,35 @@ abstract class UnixTime $javaType ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; if (!${ev.isNull}) { try { - ${ev.value} = $formatterName.parse(${eval1.value}.toString()).getTime() / 1000L; + ${ev.value} = $formatterName.parse(${eval1.value}.toString()) / 1000000L; + } catch (java.lang.IllegalArgumentException e) { + ${ev.isNull} = true; } catch (java.text.ParseException e) { ${ev.isNull} = true; + } catch (java.time.format.DateTimeParseException e) { + ${ev.isNull} = true; + } catch (java.time.DateTimeException e) { + ${ev.isNull} = true; } }""") } case StringType => val tz = ctx.addReferenceObj("timeZone", timeZone) - val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val locale = ctx.addReferenceObj("locale", Locale.US) + val dtu = TimestampFormatter.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (string, format) => { s""" try { - ${ev.value} = $dtu.newDateFormat($format.toString(), $tz) - .parse($string.toString()).getTime() / 1000L; + ${ev.value} = $dtu.apply($format.toString(), $tz, $locale) + .parse($string.toString()) / 1000000L; } catch (java.lang.IllegalArgumentException e) { ${ev.isNull} = true; } catch (java.text.ParseException e) { ${ev.isNull} = true; + } catch (java.time.format.DateTimeParseException e) { + ${ev.isNull} = true; + } catch (java.time.DateTimeException e) { + ${ev.isNull} = true; } """ }) @@ -806,9 +819,9 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ copy(timeZoneId = Option(timeZoneId)) private lazy val constFormat: UTF8String = right.eval().asInstanceOf[UTF8String] - private lazy val formatter: DateFormat = + private lazy val formatter: TimestampFormatter = try { - DateTimeUtils.newDateFormat(constFormat.toString, timeZone) + TimestampFormatter(constFormat.toString, timeZone, Locale.US) } catch { case NonFatal(_) => null } @@ -823,8 +836,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ null } else { try { - UTF8String.fromString(formatter.format( - new java.util.Date(time.asInstanceOf[Long] * 1000L))) + UTF8String.fromString(formatter.format(time.asInstanceOf[Long] * MICROS_PER_SECOND)) } catch { case NonFatal(_) => null } @@ -835,8 +847,8 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ null } else { try { - UTF8String.fromString(DateTimeUtils.newDateFormat(f.toString, timeZone) - .format(new java.util.Date(time.asInstanceOf[Long] * 1000L))) + UTF8String.fromString(TimestampFormatter(f.toString, timeZone, Locale.US) + .format(time.asInstanceOf[Long] * MICROS_PER_SECOND)) } catch { case NonFatal(_) => null } @@ -846,7 +858,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val df = classOf[DateFormat].getName + val df = classOf[TimestampFormatter].getName if (format.foldable) { if (formatter == null) { ExprCode.forNullValue(StringType) @@ -859,8 +871,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ ${CodeGenerator.javaType(dataType)} ${ev.value} = ${CodeGenerator.defaultValue(dataType)}; if (!${ev.isNull}) { try { - ${ev.value} = UTF8String.fromString($formatterName.format( - new java.util.Date(${t.value} * 1000L))); + ${ev.value} = UTF8String.fromString($formatterName.format(${t.value} * 1000000L)); } catch (java.lang.IllegalArgumentException e) { ${ev.isNull} = true; } @@ -868,12 +879,13 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ } } else { val tz = ctx.addReferenceObj("timeZone", timeZone) - val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val locale = ctx.addReferenceObj("locale", Locale.US) + val tf = TimestampFormatter.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (seconds, f) => { s""" try { - ${ev.value} = UTF8String.fromString($dtu.newDateFormat($f.toString(), $tz).format( - new java.util.Date($seconds * 1000L))); + ${ev.value} = UTF8String.fromString($tf.apply($f.toString(), $tz, $locale). + format($seconds * 1000000L)); } catch (java.lang.IllegalArgumentException e) { ${ev.isNull} = true; }""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index d1bc00c08c1c..3203e626ea40 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -37,8 +37,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { private val decimalParser = ExprUtils.getDecimalParser(options.locale) - @transient - private lazy val timestampFormatter = TimestampFormatter( + private val timestampFormatter = TimestampFormatter( options.timestampFormat, options.timeZone, options.locale) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala index 9e8d51cc65f0..b4c99674fc1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala @@ -26,7 +26,7 @@ import org.apache.commons.lang3.time.FastDateFormat import org.apache.spark.sql.internal.SQLConf -sealed trait DateFormatter { +sealed trait DateFormatter extends Serializable { def parse(s: String): Int // returns days since epoch def format(days: Int): String } @@ -35,7 +35,8 @@ class Iso8601DateFormatter( pattern: String, locale: Locale) extends DateFormatter with DateTimeFormatterHelper { - private val formatter = buildFormatter(pattern, locale) + @transient + private lazy val formatter = buildFormatter(pattern, locale) private val UTC = ZoneId.of("UTC") private def toInstant(s: String): Instant = { @@ -56,7 +57,8 @@ class Iso8601DateFormatter( } class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter { - private val format = FastDateFormat.getInstance(pattern, locale) + @transient + private lazy val format = FastDateFormat.getInstance(pattern, locale) override def parse(s: String): Int = { val milliseconds = format.parse(s).getTime diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala index b85101d38d9e..91cc57e0bb01 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala @@ -17,27 +17,36 @@ package org.apache.spark.sql.catalyst.util -import java.time.{Instant, LocalDateTime, ZonedDateTime, ZoneId} -import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} -import java.time.temporal.{ChronoField, TemporalAccessor} +import java.time._ +import java.time.chrono.IsoChronology +import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle} +import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries} import java.util.Locale trait DateTimeFormatterHelper { protected def buildFormatter(pattern: String, locale: Locale): DateTimeFormatter = { new DateTimeFormatterBuilder() + .parseCaseInsensitive() .appendPattern(pattern) - .parseDefaulting(ChronoField.YEAR_OF_ERA, 1970) + .parseDefaulting(ChronoField.ERA, 1) .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1) .parseDefaulting(ChronoField.DAY_OF_MONTH, 1) - .parseDefaulting(ChronoField.HOUR_OF_DAY, 0) .parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0) .parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0) .toFormatter(locale) + .withChronology(IsoChronology.INSTANCE) + .withResolverStyle(ResolverStyle.STRICT) } protected def toInstantWithZoneId(temporalAccessor: TemporalAccessor, zoneId: ZoneId): Instant = { - val localDateTime = LocalDateTime.from(temporalAccessor) + val localTime = if (temporalAccessor.query(TemporalQueries.localTime) == null) { + LocalTime.ofNanoOfDay(0) + } else { + LocalTime.from(temporalAccessor) + } + val localDate = LocalDate.from(temporalAccessor) + val localDateTime = LocalDateTime.of(localDate, localTime) val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId) Instant.from(zonedDateTime) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index c6dfdbf2505b..3e5e1fbc2b36 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -111,16 +111,6 @@ object DateTimeUtils { computedTimeZones.computeIfAbsent(timeZoneId, computeTimeZone) } - def newDateFormat(formatString: String, timeZone: TimeZone): DateFormat = { - val sdf = new SimpleDateFormat(formatString, Locale.US) - sdf.setTimeZone(timeZone) - // Enable strict parsing, if the input date/format is invalid, it will throw an exception. - // e.g. to parse invalid date '2016-13-12', or '2016-01-12' with invalid format 'yyyy-aa-dd', - // an exception will be throwed. - sdf.setLenient(false) - sdf - } - // we should use the exact day as Int, for example, (year, month, day) -> day def millisToDays(millisUtc: Long): SQLDate = { millisToDays(millisUtc, defaultTimeZone()) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala index eb1303303463..b67b2d7cc3c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.catalyst.util +import java.text.ParseException import java.time._ +import java.time.format.DateTimeParseException import java.time.temporal.TemporalQueries import java.util.{Locale, TimeZone} @@ -27,7 +29,19 @@ import org.apache.commons.lang3.time.FastDateFormat import org.apache.spark.sql.internal.SQLConf -sealed trait TimestampFormatter { +sealed trait TimestampFormatter extends Serializable { + /** + * Parses a timestamp in a string and converts it to microseconds. + * + * @param s - string with timestamp to parse + * @return microseconds since epoch. + * @throws ParseException can be thrown by legacy parser + * @throws DateTimeParseException can be thrown by new parser + * @throws DateTimeException unable to obtain local date or time + */ + @throws(classOf[ParseException]) + @throws(classOf[DateTimeParseException]) + @throws(classOf[DateTimeException]) def parse(s: String): Long // returns microseconds since epoch def format(us: Long): String } @@ -36,7 +50,8 @@ class Iso8601TimestampFormatter( pattern: String, timeZone: TimeZone, locale: Locale) extends TimestampFormatter with DateTimeFormatterHelper { - private val formatter = buildFormatter(pattern, locale) + @transient + private lazy val formatter = buildFormatter(pattern, locale) private def toInstant(s: String): Instant = { val temporalAccessor = formatter.parse(s) @@ -68,7 +83,8 @@ class LegacyTimestampFormatter( pattern: String, timeZone: TimeZone, locale: Locale) extends TimestampFormatter { - private val format = FastDateFormat.getInstance(pattern, timeZone, locale) + @transient + private lazy val format = FastDateFormat.getInstance(pattern, timeZone, locale) protected def toMillis(s: String): Long = format.parse(s).getTime diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala index 2d0b0d3033a9..4ae61bc61255 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala @@ -112,7 +112,7 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper { assert(parser.makeConverter("_1", BooleanType).apply("true") == true) var timestampsOptions = - new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy hh:mm"), false, "GMT") + new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy HH:mm"), false, "GMT") parser = new UnivocityParser(StructType(Seq.empty), timestampsOptions) val customTimestamp = "31/01/2015 00:00" var format = FastDateFormat.getInstance( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala index 019615b81101..2dc55e0e1f63 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.util +import java.time.LocalDate import java.util.Locale import org.apache.spark.SparkFunSuite @@ -89,4 +90,10 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper { } } } + + test("parsing date without explicit day") { + val formatter = DateFormatter("yyyy MMM", Locale.US) + val daysSinceEpoch = formatter.parse("2018 Dec") + assert(daysSinceEpoch === LocalDate.of(2018, 12, 1).toEpochDay) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala index c110ffa01f73..edccbb2a7f5d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.util +import java.time.{LocalDateTime, ZoneOffset} import java.util.{Locale, TimeZone} +import java.util.concurrent.TimeUnit import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.plans.SQLHelper @@ -106,4 +108,14 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper { } } } + + test(" case insensitive parsing of am and pm") { + val formatter = TimestampFormatter( + "yyyy MMM dd hh:mm:ss a", + TimeZone.getTimeZone("UTC"), + Locale.US) + val micros = formatter.parse("2009 Mar 20 11:30:01 am") + assert(micros === TimeUnit.SECONDS.toMicros( + LocalDateTime.of(2009, 3, 20, 11, 30, 1).toEpochSecond(ZoneOffset.UTC))) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 33186f778d86..645452553e6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2578,7 +2578,7 @@ object functions { * Converts a date/timestamp/string to a value of string in the format specified by the date * format given by the second argument. * - * See [[java.text.SimpleDateFormat]] for valid date and time format patterns + * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param dateExpr A date, timestamp or string. If a string, the data must be in a format that * can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` @@ -2811,7 +2811,7 @@ object functions { * representing the timestamp of that moment in the current system time zone in the given * format. * - * See [[java.text.SimpleDateFormat]] for valid date and time format patterns + * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param ut A number of a type that is castable to a long, such as string or integer. Can be * negative for timestamps before the unix epoch @@ -2855,7 +2855,7 @@ object functions { /** * Converts time string with given pattern to Unix timestamp (in seconds). * - * See [[java.text.SimpleDateFormat]] for valid date and time format patterns + * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param s A date, timestamp or string. If a string, the data must be in a format that can be * cast to a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` @@ -2883,7 +2883,7 @@ object functions { /** * Converts time string with the given pattern to timestamp. * - * See [[java.text.SimpleDateFormat]] for valid date and time format patterns + * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param s A date, timestamp or string. If a string, the data must be in a format that can be * cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` @@ -2908,7 +2908,7 @@ object functions { /** * Converts the column into a `DateType` with a specified format * - * See [[java.text.SimpleDateFormat]] for valid date and time format patterns + * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param e A date, timestamp or string. If a string, the data must be in a format that can be * cast to a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index c4ec7150c407..62bb72dd6ea2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -405,7 +405,7 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Row(Date.valueOf("2014-12-31")))) checkAnswer( df.select(to_date(col("s"), "yyyy-MM-dd")), - Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) // now switch format checkAnswer(