13 changes: 5 additions & 8 deletions docs/sql-ref-datetime-pattern.md
@@ -40,7 +40,7 @@ Spark uses pattern letters in the following table for date and timestamp parsing
|**w**|week-of-week-based-year|number(2)|27|
|**W**|week-of-month|number(1)|4|
|**E**|day-of-week|text|Tue; Tuesday|
-|**u**|localized day-of-week|number/text|2; 02; Tue; Tuesday|
+|**u**|day-of-week|number/text|2; 02; Tue; Tuesday|
|**F**|week-of-month|number(1)|3|
|**a**|am-pm-of-day|am-pm|PM|
|**h**|clock-hour-of-am-pm (1-12)|number(2)|12|
@@ -63,7 +63,10 @@ Spark uses pattern letters in the following table for date and timestamp parsing

The count of pattern letters determines the format.

-- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. More than 5 pattern letters will fail.
+- Text: The text style is determined based on the number of pattern letters used. Less than 4 pattern letters will use the short form. Exactly 4 pattern letters will use the full form. Exactly 5 pattern letters will use the narrow form. More than 5 pattern letters will fail. More details for the text style:
Contributor:
How about:

> ... will use the short form (typically an abbreviation, for example, day-of-week Monday might output "Mon"). ...

BTW, we should remove "Exactly 5 pattern letters will use the narrow form".

+- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".
+- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".

- Number(n): The n here represents the maximum count of letters with which this type of datetime pattern can be used. If the count of letters is one, then the value is output using the minimum number of digits and without padding. Otherwise, the count of digits is used as the width of the output field, with the value zero-padded as necessary.
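To make the letter-count rules for Text and Number(n) concrete, here is a standalone java.time sketch (an illustration, not part of this diff; Spark's non-legacy formatter is built on the same `DateTimeFormatter` API):

```scala
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Locale

val monday = LocalDate.of(2020, 1, 6) // a Monday

// Text: fewer than 4 letters selects the short form, exactly 4 the full form.
DateTimeFormatter.ofPattern("E", Locale.US).format(monday)    // "Mon"
DateTimeFormatter.ofPattern("EEEE", Locale.US).format(monday) // "Monday"

// Number(n): one letter prints the minimum number of digits without padding;
// a longer count zero-pads the value to that width.
DateTimeFormatter.ofPattern("d").format(monday)  // "6"
DateTimeFormatter.ofPattern("dd").format(monday) // "06"
```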

@@ -138,9 +141,3 @@ The count of pattern letters determines the format.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).

- Symbols 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.

-More details for the text style:
-
-- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".
-
-- Full Form: Full text, typically the full description. For example, day-of-week Monday might output "Monday".
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -117,7 +117,9 @@ class LegacySimpleDateFormatter(pattern: String, locale: Locale) extends LegacyD
object DateFormatter {
import LegacyDateFormats._

-val defaultLocale: Locale = Locale.US
+import DateTimeFormatterHelper._
+
+presetSundayStartToMondayStart()

val defaultPattern: String = "yyyy-MM-dd"

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -17,11 +17,13 @@

package org.apache.spark.sql.catalyst.util

+import java.lang.reflect.{Field, Modifier}
import java.time._
import java.time.chrono.IsoChronology
import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle}
-import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries}
+import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries, WeekFields}
import java.util.Locale
+import java.util.concurrent.ConcurrentHashMap

import com.google.common.cache.CacheBuilder

@@ -155,7 +157,22 @@ trait DateTimeFormatterHelper {
}
}

-private object DateTimeFormatterHelper {
+private[spark] object DateTimeFormatterHelper {
+val defaultLocale = Locale.US
+
+def presetSundayStartToMondayStart(): Unit = {
+val CACHE: Field = classOf[WeekFields].getDeclaredField("CACHE")
Member:
Yikes, isn't this going to change the Locale's settings for the whole JVM?

Member Author:
Only for those trying to use the original value of SUNDAY as the first day with minimalDays 1. This seems not to be a good way, though.

Contributor:
Yeah, we shouldn't change a global cache that is shared across the entire JVM, including user code.

+CACHE.setAccessible(true)
+val modifiers: Field = CACHE.getClass.getDeclaredField("modifiers")
+modifiers.setAccessible(true)
+modifiers.setInt(CACHE, CACHE.getModifiers & ~Modifier.FINAL)
+val newCache = new ConcurrentHashMap[String, WeekFields]()
+// Preset the Sunday start entry to ISO-based Monday start instance for retrieving first day
+// of week
+newCache.put(DayOfWeek.SUNDAY.toString + 1, WeekFields.ISO)
+CACHE.set(null, newCache)
+}
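For context on what this preset changes: `WeekFields.of(DayOfWeek.SUNDAY, 1)` is the instance that `Locale.US` lookups resolve to, and its cache key is exactly the `DayOfWeek.SUNDAY.toString + 1` string used above. A standalone sketch (not part of this diff, run in a fresh JVM without the preset) contrasting the two week definitions:

```scala
import java.time.{DayOfWeek, LocalDate}
import java.time.temporal.WeekFields

val date = LocalDate.of(2019, 12, 29) // a Sunday

// ISO weeks: Monday is day 1, so this Sunday is day 7 of week 52 of 2019.
val iso = WeekFields.ISO
(date.get(iso.weekBasedYear()), date.get(iso.weekOfWeekBasedYear()), date.get(iso.dayOfWeek()))
// => (2019, 52, 7)

// US-style weeks (Sunday start, minimalDays = 1): the same date becomes
// day 1 of week 1 of week-based-year 2020.
val us = WeekFields.of(DayOfWeek.SUNDAY, 1)
(date.get(us.weekBasedYear()), date.get(us.weekOfWeekBasedYear()), date.get(us.dayOfWeek()))
// => (2020, 1, 1)
```

With the preset in place, the second lookup also returns `WeekFields.ISO`, which is why the new formatter's 'YYYY-ww-uu' results in the golden files below come out as 2019-52-07.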

val cache = CacheBuilder.newBuilder()
.maximumSize(128)
.build[(String, Locale, Boolean), DateTimeFormatter]()
@@ -222,7 +239,7 @@ private object DateTimeFormatterHelper {
.appendValue(ChronoField.MINUTE_OF_HOUR, 2).appendLiteral(':')
.appendValue(ChronoField.SECOND_OF_MINUTE, 2)
.appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)
-toFormatter(builder, TimestampFormatter.defaultLocale)
+toFormatter(builder, defaultLocale)
}

private final val bugInStandAloneForm = {
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -29,6 +29,7 @@ import java.util.concurrent.TimeUnit.SECONDS
import org.apache.commons.lang3.time.FastDateFormat

import org.apache.spark.sql.catalyst.util.DateTimeConstants._
+import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.{LegacyDateFormat, LENIENT_SIMPLE_DATE_FORMAT}
import org.apache.spark.sql.catalyst.util.RebaseDateTime._
@@ -120,7 +121,7 @@ class FractionTimestampFormatter(zoneId: ZoneId)
extends Iso8601TimestampFormatter(
TimestampFormatter.defaultPattern,
zoneId,
-TimestampFormatter.defaultLocale,
+defaultLocale,
LegacyDateFormats.FAST_DATE_FORMAT,
needVarLengthSecondFraction = false) {

@@ -278,7 +279,7 @@ object LegacyDateFormats extends Enumeration {
object TimestampFormatter {
import LegacyDateFormats._

-val defaultLocale: Locale = Locale.US
+presetSundayStartToMondayStart()

def defaultPattern(): String = s"${DateFormatter.defaultPattern} HH:mm:ss"

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
@@ -21,7 +21,7 @@ import java.time.{DateTimeException, LocalDate}

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
-import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
+import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatterHelper, LegacyDateFormats}
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
@@ -71,7 +71,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
getZoneId(timeZone),
-DateFormatter.defaultLocale,
+DateTimeFormatterHelper.defaultLocale,
legacyFormat,
isParsing = false)
val days = formatter.parse(date)
@@ -106,7 +106,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
getZoneId(timeZone),
-DateFormatter.defaultLocale,
+DateTimeFormatterHelper.defaultLocale,
legacyFormat,
isParsing = false)
val date = formatter.format(days)
@@ -174,7 +174,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
getZoneId(timeZone),
-DateFormatter.defaultLocale,
+DateTimeFormatterHelper.defaultLocale,
legacyFormat,
isParsing = false)
assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
@@ -25,6 +25,7 @@ import org.scalatest.Matchers
import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.{LegacyDateFormats, TimestampFormatter}
+import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
@@ -290,7 +291,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
TimestampFormatter(
TimestampFormatter.defaultPattern,
zoneId,
-TimestampFormatter.defaultLocale,
+defaultLocale,
legacyFormat,
isParsing = false)
}.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId)
sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala
@@ -22,7 +22,7 @@ import java.sql.{Date, Timestamp}
import java.time.{Instant, LocalDate, ZoneOffset}

import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, LegacyDateFormats, TimestampFormatter}
+import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatterHelper, DateTimeUtils, LegacyDateFormats, TimestampFormatter}
import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand}
import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec}
import org.apache.spark.sql.internal.SQLConf
Expand Down Expand Up @@ -84,7 +84,7 @@ object HiveResult {
format = DateFormatter.defaultPattern,
// We can set any time zone id. UTC was taken for simplicity.
zoneId = ZoneOffset.UTC,
-locale = DateFormatter.defaultLocale,
+locale = DateTimeFormatterHelper.defaultLocale,
// Use `FastDateFormat` as the legacy formatter because it is thread-safe.
legacyFormat = LegacyDateFormats.FAST_DATE_FORMAT,
isParsing = false)
5 changes: 5 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -164,3 +164,8 @@ select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy
select from_unixtime(1, 'yyyyyyyyyyy-MM-dd');
select date_format(timestamp '2018-11-17 13:33:33', 'yyyyyyyyyy-MM-dd HH:mm:ss');
select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd');
+
+select date_format(date '2019-12-29', 'YYYY-ww-uu');
+select date_format('2020-01-01', 'YYYY-MM-dd uu');
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu');
+SELECT to_csv(named_struct('time', to_timestamp('2019-12-29', 'yyyy-MM-dd')), map('timestampFormat', 'YYYY-ww-uu', 'locale', 'en-US'));
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 123


-- !query
@@ -1025,3 +1025,35 @@ struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select date_format(date '2019-12-29', 'YYYY-ww-uu')
+-- !query schema
+struct<date_format(CAST(DATE '2019-12-29' AS TIMESTAMP), YYYY-ww-uu):string>
+-- !query output
+2019-52-07
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 Wednesday
+
+
+-- !query
+SELECT to_csv(named_struct('time', to_timestamp('2019-12-29', 'yyyy-MM-dd')), map('timestampFormat', 'YYYY-ww-uu', 'locale', 'en-US'))
+-- !query schema
+struct<to_csv(named_struct(time, to_timestamp(2019-12-29, yyyy-MM-dd))):string>
+-- !query output
+2019-52-07
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 123


-- !query
@@ -980,3 +980,35 @@ select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd')
struct<date_format(CAST(DATE '2018-11-17' AS TIMESTAMP), yyyyyyyyyyy-MM-dd):string>
-- !query output
00000002018-11-17
+
+
+-- !query
+select date_format(date '2019-12-29', 'YYYY-ww-uu')
+-- !query schema
+struct<date_format(CAST(DATE '2019-12-29' AS TIMESTAMP), YYYY-ww-uu):string>
+-- !query output
+2020-01-07
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 0003
+
+
+-- !query
+SELECT to_csv(named_struct('time', to_timestamp('2019-12-29', 'yyyy-MM-dd')), map('timestampFormat', 'YYYY-ww-uu', 'locale', 'en-US'))
+-- !query schema
+struct<to_csv(named_struct(time, to_timestamp(2019-12-29, yyyy-MM-dd))):string>
+-- !query output
+2020-01-07
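This result block differs from the others (2020-01-07 instead of 2019-52-07), which is consistent with it being the legacy-formatter golden file: `java.text.SimpleDateFormat` defines 'u' as the day number of the week with Monday = 1, while 'YYYY' and 'ww' follow the US Sunday-start calendar. A small sketch (an illustration, not part of the diff) reproducing the value:

```scala
import java.text.SimpleDateFormat
import java.util.Locale

// 2019-12-29 is a Sunday: the US calendar (Sunday start, minimalDays = 1)
// puts it in week 1 of week-year 2020, and SimpleDateFormat's 'u' numbers
// days Monday = 1 ... Sunday = 7.
val sdf = new SimpleDateFormat("YYYY-ww-uu", Locale.US)
println(sdf.format(java.sql.Date.valueOf("2019-12-29"))) // 2020-01-07
```

The 2019-52-07 values in the other result files match the ISO `WeekFields` sketch shown earlier.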
34 changes: 33 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 119
+-- Number of queries: 123


-- !query
@@ -997,3 +997,35 @@ struct<>
-- !query output
org.apache.spark.SparkUpgradeException
You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select date_format(date '2019-12-29', 'YYYY-ww-uu')
+-- !query schema
+struct<date_format(CAST(DATE '2019-12-29' AS TIMESTAMP), YYYY-ww-uu):string>
+-- !query output
+2019-52-07
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 Wednesday
+
+
+-- !query
+SELECT to_csv(named_struct('time', to_timestamp('2019-12-29', 'yyyy-MM-dd')), map('timestampFormat', 'YYYY-ww-uu', 'locale', 'en-US'))
+-- !query schema
+struct<to_csv(named_struct(time, to_timestamp(2019-12-29, yyyy-MM-dd))):string>
+-- !query output
+2019-52-07