-
Notifications
You must be signed in to change notification settings - Fork 29.1k
[SPARK-46536][SQL] Support GROUP BY calendar_interval_type #44538
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9c06dd4
cc127e8
635a8a0
f071a49
52d52c2
df2a24d
533d176
5c4de2b
9095245
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -44,7 +44,7 @@ | |
| * @since 3.0.0 | ||
| */ | ||
| @Unstable | ||
| public final class CalendarInterval implements Serializable { | ||
| public final class CalendarInterval implements Serializable, Comparable<CalendarInterval> { | ||
| // NOTE: If you're moving or renaming this file, you should also update Unidoc configuration | ||
| // specified in 'SparkBuild.scala'. | ||
| public final int months; | ||
|
|
@@ -127,4 +127,26 @@ private void appendUnit(StringBuilder sb, long value, String unit) { | |
| * @throws ArithmeticException if a numeric overflow occurs | ||
| */ | ||
| public Duration extractAsDuration() { return Duration.of(microseconds, ChronoUnit.MICROS); } | ||
|
|
||
| /** | ||
| * This method is not used to order CalendarInterval instances, as they are not orderable and | ||
| * cannot be used in an ORDER BY clause. | ||
| * Instead, it is used to find identical interval instances for aggregation purposes. | ||
| * It compares the 'months', 'days', and 'microseconds' fields of this CalendarInterval | ||
| * with another instance. The comparison is done first on the 'months', then on the 'days', | ||
| * and finally on the 'microseconds'. | ||
| * | ||
| * @param o The CalendarInterval instance to compare with. | ||
| * @return Zero if both intervals have identical 'months', 'days' and 'microseconds'; otherwise a | ||
| * negative or positive value decided by the first of those fields that differs | ||
| */ | ||
| @Override | ||
| public int compareTo(CalendarInterval o) { | ||
| if (this.months != o.months) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comparing intervals does not necessarily short-circuit on months. We could result in … (the example is missing from this extract). Besides, 1 month can be 28 ~ 31 days, making the legacy calendar interval type incomparable.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should add some comments to explain that this is alphabetical ordering: it has no actual meaning and only makes it possible to find identical interval instances. We should do the same thing for the map type so that we can group by map values.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @stefankandic did you generate this using IDEA?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added the comments. @cloud-fan, the method was generated by IntelliJ, but I implemented the logic. |
||
| return Integer.compare(this.months, o.months); | ||
| } else if (this.days != o.days) { | ||
| return Integer.compare(this.days, o.days); | ||
| } else { | ||
| return Long.compare(this.microseconds, o.microseconds); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,7 @@ import org.apache.spark.sql.test.SQLTestData.DecimalData | |
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.sql.types.DayTimeIntervalType.{DAY, HOUR, MINUTE, SECOND} | ||
| import org.apache.spark.sql.types.YearMonthIntervalType.{MONTH, YEAR} | ||
| import org.apache.spark.unsafe.types.CalendarInterval | ||
|
|
||
| case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) | ||
|
|
||
|
|
@@ -2125,6 +2126,37 @@ class DataFrameAggregateSuite extends QueryTest | |
| Seq(Row(1)) | ||
| ) | ||
| } | ||
|
|
||
| test("SPARK-46536 Support GROUP BY CalendarIntervalType") { | ||
| val numRows = 50 | ||
| val configurations = Seq( | ||
| Seq.empty[(String, String)], // hash aggregate is used by default | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We also need to set
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Had to disable codegen in order to hit the fallback logic, but hopefully it now tests it properly. |
||
| Seq(SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN", | ||
| "spark.sql.TungstenAggregate.testFallbackStartsAt" -> "1, 10"), | ||
| Seq("spark.sql.test.forceApplyObjectHashAggregate" -> "true"), | ||
| Seq( | ||
| "spark.sql.test.forceApplyObjectHashAggregate" -> "true", | ||
| SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "1"), | ||
| Seq("spark.sql.test.forceApplySortAggregate" -> "true") | ||
| ) | ||
|
|
||
| val dfSame = (0 until numRows) | ||
| .map(_ => Tuple1(new CalendarInterval(1, 2, 3))) | ||
| .toDF("c0") | ||
|
|
||
| val dfDifferent = (0 until numRows) | ||
| .map(i => Tuple1(new CalendarInterval(i, i, i))) | ||
| .toDF("c0") | ||
|
|
||
| for (conf <- configurations) { | ||
| withSQLConf(conf: _*) { | ||
| assert(createAggregate(dfSame).count() == 1) | ||
| assert(createAggregate(dfDifferent).count() == numRows) | ||
| } | ||
| } | ||
|
|
||
| def createAggregate(df: DataFrame): DataFrame = df.groupBy("c0").agg(count("*")) | ||
| } | ||
| } | ||
|
|
||
| case class B(c: Option[Double]) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interesting. Please check the behavior
#27262
I'm not sure. @yaooqinn