Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}

class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite

val currentIcuVersion: String = "75.1"
val currentIcuVersion: String = "76.1"

test("collationId stability") {
assert(INDETERMINATE_COLLATION_ID == -1)
Expand Down
2 changes: 1 addition & 1 deletion dev/deps/spark-deps-hadoop-3-hive-2.3
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ hk2-locator/3.0.6//hk2-locator-3.0.6.jar
hk2-utils/3.0.6//hk2-utils-3.0.6.jar
httpclient/4.5.14//httpclient-4.5.14.jar
httpcore/4.4.16//httpcore-4.4.16.jar
icu4j/75.1//icu4j-75.1.jar
icu4j/76.1//icu4j-76.1.jar
ini4j/0.5.4//ini4j-0.5.4.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.5.2//ivy-2.5.2.jar
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@
<datasketches.version>6.1.1</datasketches.version>
<netty.version>4.1.110.Final</netty.version>
<netty-tcnative.version>2.0.66.Final</netty-tcnative.version>
<icu4j.version>75.1</icu4j.version>
<icu4j.version>76.1</icu4j.version>
<junit-jupiter.version>5.11.0</junit-jupiter.version>
<junit-platform.version>1.11.0</junit-platform.version>
<sbt-jupiter-interface.version>0.13.0</sbt-jupiter-interface.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
}

test("CollationKey generates correct collation key for collated string") {
// Collation-key byte emitted per 'a'/'A' character: 0x2A (42) in ICU `75.1`, 0x2B (43) in `76.1`
val b: Byte = 0x2B
val testCases = Seq(
("", "UTF8_BINARY", UTF8String.fromString("").getBytes),
("aa", "UTF8_BINARY", UTF8String.fromString("aa").getBytes),
Expand All @@ -180,15 +182,15 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
(" AA ", "UTF8_LCASE_RTRIM", UTF8String.fromString(" aa").getBytes),
("aA", "UTF8_LCASE", UTF8String.fromString("aa").getBytes),
("", "UNICODE", Array[Byte](1, 1, 0)),
("aa", "UNICODE", Array[Byte](42, 42, 1, 6, 1, 6, 0)),
("AA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -36, -36, 0)),
("aA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -59, -36, 0)),
("aa ", "UNICODE_RTRIM", Array[Byte](42, 42, 1, 6, 1, 6, 0)),
("aa", "UNICODE", Array[Byte](b, b, 1, 6, 1, 6, 0)),
("AA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -36, -36, 0)),
("aA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -59, -36, 0)),
("aa ", "UNICODE_RTRIM", Array[Byte](b, b, 1, 6, 1, 6, 0)),
("", "UNICODE_CI", Array[Byte](1, 0)),
("aa", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)),
("aa ", "UNICODE_CI_RTRIM", Array[Byte](42, 42, 1, 6, 0)),
("AA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)),
("aA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0))
("aa", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)),
("aa ", "UNICODE_CI_RTRIM", Array[Byte](b, b, 1, 6, 0)),
("AA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)),
("aA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0))
)
for ((input, collation, expected) <- testCases) {
val str = Literal.create(input, StringType(collation))
Expand Down
60 changes: 30 additions & 30 deletions sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
Original file line number Diff line number Diff line change
@@ -1,54 +1,54 @@
OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 1353 1353 1 0.1 13526.6 1.0X
UTF8_LCASE 2703 2705 3 0.0 27032.4 2.0X
UNICODE 16848 16894 65 0.0 168482.9 12.5X
UNICODE_CI 16362 16367 8 0.0 163615.6 12.1X
UTF8_BINARY 1349 1349 0 0.1 13485.4 1.0X
UTF8_LCASE 3559 3561 3 0.0 35594.3 2.6X
UNICODE 17580 17589 12 0.0 175803.6 13.0X
UNICODE_CI 17210 17212 2 0.0 172100.2 12.8X

OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 2640 2642 3 0.0 26401.5 1.0X
UTF8_LCASE 3616 3618 2 0.0 36164.8 1.4X
UNICODE 17465 17470 7 0.0 174650.9 6.6X
UNICODE_CI 17251 17264 18 0.0 172510.9 6.5X
UTF8_BINARY 1740 1741 1 0.1 17398.8 1.0X
UTF8_LCASE 2630 2632 3 0.0 26301.0 1.5X
UNICODE 16732 16743 16 0.0 167319.7 9.6X
UNICODE_CI 16482 16492 14 0.0 164819.7 9.5X

OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 2843 2844 1 0.0 28427.2 1.0X
UTF8_LCASE 5417 5437 28 0.0 54170.7 1.9X
UNICODE 68601 68619 26 0.0 686010.8 24.1X
UNICODE_CI 56342 56361 26 0.0 563422.2 19.8X
UTF8_BINARY 2808 2808 0 0.0 28082.3 1.0X
UTF8_LCASE 5412 5413 1 0.0 54123.5 1.9X
UNICODE 70755 70787 44 0.0 707553.4 25.2X
UNICODE_CI 57639 57669 43 0.0 576390.0 20.5X

OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 7674 7674 1 0.0 76735.3 1.0X
UTF8_LCASE 20367 20376 14 0.0 203665.1 2.7X
UNICODE 377133 377909 1098 0.0 3771328.8 49.1X
UNICODE_CI 434710 435099 551 0.0 4347097.2 56.7X
UTF8_BINARY 9356 9357 0 0.0 93564.9 1.0X
UTF8_LCASE 24106 24129 33 0.0 241055.3 2.6X
UNICODE 368428 369053 883 0.0 3684284.1 39.4X
UNICODE_CI 417361 418242 1246 0.0 4173613.9 44.6X

OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 6956 6959 4 0.0 69561.7 1.0X
UTF8_LCASE 14246 14262 23 0.0 142459.6 2.0X
UNICODE 369940 370072 186 0.0 3699400.9 53.2X
UNICODE_CI 442072 442365 414 0.0 4420718.1 63.6X
UTF8_BINARY 10941 10943 2 0.0 109411.5 1.0X
UTF8_LCASE 20041 20058 24 0.0 200410.1 1.8X
UNICODE 364296 365610 1859 0.0 3642958.8 33.3X
UNICODE_CI 424306 424888 823 0.0 4243062.7 38.8X

OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 6927 6927 0 0.0 69265.2 1.0X
UTF8_LCASE 15505 15514 12 0.0 155054.5 2.2X
UNICODE 382361 382426 93 0.0 3823606.6 55.2X
UNICODE_CI 449956 450063 151 0.0 4499562.9 65.0X
UTF8_BINARY 10551 10556 7 0.0 105511.7 1.0X
UTF8_LCASE 20294 20300 9 0.0 202943.7 1.9X
UNICODE 384070 384554 684 0.0 3840704.6 36.4X
UNICODE_CI 441935 442184 352 0.0 4419351.4 41.9X

60 changes: 30 additions & 30 deletions sql/core/benchmarks/CollationBenchmark-results.txt
Original file line number Diff line number Diff line change
@@ -1,54 +1,54 @@
OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 1372 1372 1 0.1 13718.5 1.0X
UTF8_LCASE 3115 3116 1 0.0 31154.4 2.3X
UNICODE 19813 19820 9 0.0 198132.2 14.4X
UNICODE_CI 19669 19686 24 0.0 196694.2 14.3X
UTF8_BINARY 1372 1372 1 0.1 13715.2 1.0X
UTF8_LCASE 3847 3851 6 0.0 38467.3 2.8X
UNICODE 19659 19662 4 0.0 196587.1 14.3X
UNICODE_CI 19663 19666 3 0.0 196634.5 14.3X

OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 1727 1728 1 0.1 17271.3 1.0X
UTF8_LCASE 3034 3035 1 0.0 30337.2 1.8X
UNICODE 19230 19243 18 0.0 192301.2 11.1X
UNICODE_CI 19080 19082 3 0.0 190802.0 11.0X
UTF8_BINARY 1706 1707 3 0.1 17056.0 1.0X
UTF8_LCASE 4016 4016 0 0.0 40164.0 2.4X
UNICODE 19545 19547 3 0.0 195453.4 11.5X
UNICODE_CI 19544 19547 5 0.0 195437.5 11.5X

OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 3080 3080 0 0.0 30796.4 1.0X
UTF8_LCASE 6436 6454 25 0.0 64360.0 2.1X
UNICODE 68095 68167 101 0.0 680951.3 22.1X
UNICODE_CI 62122 62123 2 0.0 621215.8 20.2X
UTF8_BINARY 3091 3092 1 0.0 30909.8 1.0X
UTF8_LCASE 6286 6287 2 0.0 62856.0 2.0X
UNICODE 65495 65528 47 0.0 654945.7 21.2X
UNICODE_CI 59987 59994 10 0.0 599868.6 19.4X

OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 8260 8261 1 0.0 82604.0 1.0X
UTF8_LCASE 23629 23629 0 0.0 236286.4 2.9X
UNICODE 364843 366078 1747 0.0 3648427.9 44.2X
UNICODE_CI 425728 426449 1020 0.0 4257275.1 51.5X
UTF8_BINARY 13707 13726 27 0.0 137069.4 1.0X
UTF8_LCASE 28660 28685 36 0.0 286598.9 2.1X
UNICODE 363134 364168 1462 0.0 3631341.3 26.5X
UNICODE_CI 412158 412229 100 0.0 4121577.8 30.1X

OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 6844 6848 5 0.0 68440.4 1.0X
UTF8_LCASE 21849 21870 30 0.0 218486.3 3.2X
UNICODE 363474 363811 476 0.0 3634738.4 53.1X
UNICODE_CI 427563 428029 659 0.0 4275629.8 62.5X
UTF8_BINARY 12200 12205 8 0.0 121998.8 1.0X
UTF8_LCASE 27626 27633 9 0.0 276263.6 2.3X
UNICODE 350755 351083 464 0.0 3507553.8 28.8X
UNICODE_CI 409383 410380 1410 0.0 4093834.8 33.6X

OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
UTF8_BINARY 6904 6907 4 0.0 69039.3 1.0X
UTF8_LCASE 22007 22009 3 0.0 220067.8 3.2X
UNICODE 376402 377858 2060 0.0 3764015.4 54.5X
UNICODE_CI 444485 444809 458 0.0 4444850.8 64.4X
UTF8_BINARY 11879 11887 12 0.0 118786.3 1.0X
UTF8_LCASE 27743 27759 22 0.0 277434.4 2.3X
UNICODE 368435 368478 61 0.0 3684351.2 31.0X
UNICODE_CI 426350 426503 216 0.0 4263497.6 35.9X

Loading