diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
index 0e94073e4773c..1f64547da7415 100644
--- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
+++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala
@@ -33,7 +33,7 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}
class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite
- val currentIcuVersion: String = "75.1"
+ val currentIcuVersion: String = "76.1"
test("collationId stability") {
assert(INDETERMINATE_COLLATION_ID == -1)
diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3
index 7723d42423d6d..d75760967ba11 100644
--- a/dev/deps/spark-deps-hadoop-3-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3-hive-2.3
@@ -100,7 +100,7 @@ hk2-locator/3.0.6//hk2-locator-3.0.6.jar
hk2-utils/3.0.6//hk2-utils-3.0.6.jar
httpclient/4.5.14//httpclient-4.5.14.jar
httpcore/4.4.16//httpcore-4.4.16.jar
-icu4j/75.1//icu4j-75.1.jar
+icu4j/76.1//icu4j-76.1.jar
ini4j/0.5.4//ini4j-0.5.4.jar
istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar
ivy/2.5.2//ivy-2.5.2.jar
diff --git a/pom.xml b/pom.xml
index ea139ecfe6939..de9fb94244997 100644
--- a/pom.xml
+++ b/pom.xml
@@ -217,7 +217,7 @@
6.1.1
4.1.110.Final
2.0.66.Final
- 75.1
+ 76.1
5.11.0
1.11.0
0.13.0
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala
index a30f604550a38..77a3d6df69221 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala
@@ -168,6 +168,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
}
test("CollationKey generates correct collation key for collated string") {
+ // ICU collation-key byte for 'a'/'A': 0x2A (42) in ICU `75.1`, 0x2B (43) in ICU `76.1`
+ val b: Byte = 0x2B
val testCases = Seq(
("", "UTF8_BINARY", UTF8String.fromString("").getBytes),
("aa", "UTF8_BINARY", UTF8String.fromString("aa").getBytes),
@@ -180,15 +182,15 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
(" AA ", "UTF8_LCASE_RTRIM", UTF8String.fromString(" aa").getBytes),
("aA", "UTF8_LCASE", UTF8String.fromString("aa").getBytes),
("", "UNICODE", Array[Byte](1, 1, 0)),
- ("aa", "UNICODE", Array[Byte](42, 42, 1, 6, 1, 6, 0)),
- ("AA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -36, -36, 0)),
- ("aA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -59, -36, 0)),
- ("aa ", "UNICODE_RTRIM", Array[Byte](42, 42, 1, 6, 1, 6, 0)),
+ ("aa", "UNICODE", Array[Byte](b, b, 1, 6, 1, 6, 0)),
+ ("AA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -36, -36, 0)),
+ ("aA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -59, -36, 0)),
+ ("aa ", "UNICODE_RTRIM", Array[Byte](b, b, 1, 6, 1, 6, 0)),
("", "UNICODE_CI", Array[Byte](1, 0)),
- ("aa", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)),
- ("aa ", "UNICODE_CI_RTRIM", Array[Byte](42, 42, 1, 6, 0)),
- ("AA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)),
- ("aA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0))
+ ("aa", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)),
+ ("aa ", "UNICODE_CI_RTRIM", Array[Byte](b, b, 1, 6, 0)),
+ ("AA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)),
+ ("aA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0))
)
for ((input, collation, expected) <- testCases) {
val str = Literal.create(input, StringType(collation))
diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
index b2df218c8fbb4..da06b0209d0a6 100644
--- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -1,54 +1,54 @@
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1353 1353 1 0.1 13526.6 1.0X
-UTF8_LCASE 2703 2705 3 0.0 27032.4 2.0X
-UNICODE 16848 16894 65 0.0 168482.9 12.5X
-UNICODE_CI 16362 16367 8 0.0 163615.6 12.1X
+UTF8_BINARY 1349 1349 0 0.1 13485.4 1.0X
+UTF8_LCASE 3559 3561 3 0.0 35594.3 2.6X
+UNICODE 17580 17589 12 0.0 175803.6 13.0X
+UNICODE_CI 17210 17212 2 0.0 172100.2 12.8X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 2640 2642 3 0.0 26401.5 1.0X
-UTF8_LCASE 3616 3618 2 0.0 36164.8 1.4X
-UNICODE 17465 17470 7 0.0 174650.9 6.6X
-UNICODE_CI 17251 17264 18 0.0 172510.9 6.5X
+UTF8_BINARY 1740 1741 1 0.1 17398.8 1.0X
+UTF8_LCASE 2630 2632 3 0.0 26301.0 1.5X
+UNICODE 16732 16743 16 0.0 167319.7 9.6X
+UNICODE_CI 16482 16492 14 0.0 164819.7 9.5X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 2843 2844 1 0.0 28427.2 1.0X
-UTF8_LCASE 5417 5437 28 0.0 54170.7 1.9X
-UNICODE 68601 68619 26 0.0 686010.8 24.1X
-UNICODE_CI 56342 56361 26 0.0 563422.2 19.8X
+UTF8_BINARY 2808 2808 0 0.0 28082.3 1.0X
+UTF8_LCASE 5412 5413 1 0.0 54123.5 1.9X
+UNICODE 70755 70787 44 0.0 707553.4 25.2X
+UNICODE_CI 57639 57669 43 0.0 576390.0 20.5X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 7674 7674 1 0.0 76735.3 1.0X
-UTF8_LCASE 20367 20376 14 0.0 203665.1 2.7X
-UNICODE 377133 377909 1098 0.0 3771328.8 49.1X
-UNICODE_CI 434710 435099 551 0.0 4347097.2 56.7X
+UTF8_BINARY 9356 9357 0 0.0 93564.9 1.0X
+UTF8_LCASE 24106 24129 33 0.0 241055.3 2.6X
+UNICODE 368428 369053 883 0.0 3684284.1 39.4X
+UNICODE_CI 417361 418242 1246 0.0 4173613.9 44.6X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 6956 6959 4 0.0 69561.7 1.0X
-UTF8_LCASE 14246 14262 23 0.0 142459.6 2.0X
-UNICODE 369940 370072 186 0.0 3699400.9 53.2X
-UNICODE_CI 442072 442365 414 0.0 4420718.1 63.6X
+UTF8_BINARY 10941 10943 2 0.0 109411.5 1.0X
+UTF8_LCASE 20041 20058 24 0.0 200410.1 1.8X
+UNICODE 364296 365610 1859 0.0 3642958.8 33.3X
+UNICODE_CI 424306 424888 823 0.0 4243062.7 38.8X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 6927 6927 0 0.0 69265.2 1.0X
-UTF8_LCASE 15505 15514 12 0.0 155054.5 2.2X
-UNICODE 382361 382426 93 0.0 3823606.6 55.2X
-UNICODE_CI 449956 450063 151 0.0 4499562.9 65.0X
+UTF8_BINARY 10551 10556 7 0.0 105511.7 1.0X
+UTF8_LCASE 20294 20300 9 0.0 202943.7 1.9X
+UNICODE 384070 384554 684 0.0 3840704.6 36.4X
+UNICODE_CI 441935 442184 352 0.0 4419351.4 41.9X
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt
index a63b80f005ed0..c79fef5f4cf65 100644
--- a/sql/core/benchmarks/CollationBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -1,54 +1,54 @@
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1372 1372 1 0.1 13718.5 1.0X
-UTF8_LCASE 3115 3116 1 0.0 31154.4 2.3X
-UNICODE 19813 19820 9 0.0 198132.2 14.4X
-UNICODE_CI 19669 19686 24 0.0 196694.2 14.3X
+UTF8_BINARY 1372 1372 1 0.1 13715.2 1.0X
+UTF8_LCASE 3847 3851 6 0.0 38467.3 2.8X
+UNICODE 19659 19662 4 0.0 196587.1 14.3X
+UNICODE_CI 19663 19666 3 0.0 196634.5 14.3X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1727 1728 1 0.1 17271.3 1.0X
-UTF8_LCASE 3034 3035 1 0.0 30337.2 1.8X
-UNICODE 19230 19243 18 0.0 192301.2 11.1X
-UNICODE_CI 19080 19082 3 0.0 190802.0 11.0X
+UTF8_BINARY 1706 1707 3 0.1 17056.0 1.0X
+UTF8_LCASE 4016 4016 0 0.0 40164.0 2.4X
+UNICODE 19545 19547 3 0.0 195453.4 11.5X
+UNICODE_CI 19544 19547 5 0.0 195437.5 11.5X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 3080 3080 0 0.0 30796.4 1.0X
-UTF8_LCASE 6436 6454 25 0.0 64360.0 2.1X
-UNICODE 68095 68167 101 0.0 680951.3 22.1X
-UNICODE_CI 62122 62123 2 0.0 621215.8 20.2X
+UTF8_BINARY 3091 3092 1 0.0 30909.8 1.0X
+UTF8_LCASE 6286 6287 2 0.0 62856.0 2.0X
+UNICODE 65495 65528 47 0.0 654945.7 21.2X
+UNICODE_CI 59987 59994 10 0.0 599868.6 19.4X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 8260 8261 1 0.0 82604.0 1.0X
-UTF8_LCASE 23629 23629 0 0.0 236286.4 2.9X
-UNICODE 364843 366078 1747 0.0 3648427.9 44.2X
-UNICODE_CI 425728 426449 1020 0.0 4257275.1 51.5X
+UTF8_BINARY 13707 13726 27 0.0 137069.4 1.0X
+UTF8_LCASE 28660 28685 36 0.0 286598.9 2.1X
+UNICODE 363134 364168 1462 0.0 3631341.3 26.5X
+UNICODE_CI 412158 412229 100 0.0 4121577.8 30.1X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 6844 6848 5 0.0 68440.4 1.0X
-UTF8_LCASE 21849 21870 30 0.0 218486.3 3.2X
-UNICODE 363474 363811 476 0.0 3634738.4 53.1X
-UNICODE_CI 427563 428029 659 0.0 4275629.8 62.5X
+UTF8_BINARY 12200 12205 8 0.0 121998.8 1.0X
+UTF8_LCASE 27626 27633 9 0.0 276263.6 2.3X
+UNICODE 350755 351083 464 0.0 3507553.8 28.8X
+UNICODE_CI 409383 410380 1410 0.0 4093834.8 33.6X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 6904 6907 4 0.0 69039.3 1.0X
-UTF8_LCASE 22007 22009 3 0.0 220067.8 3.2X
-UNICODE 376402 377858 2060 0.0 3764015.4 54.5X
-UNICODE_CI 444485 444809 458 0.0 4444850.8 64.4X
+UTF8_BINARY 11879 11887 12 0.0 118786.3 1.0X
+UTF8_LCASE 27743 27759 22 0.0 277434.4 2.3X
+UNICODE 368435 368478 61 0.0 3684351.2 31.0X
+UNICODE_CI 426350 426503 216 0.0 4263497.6 35.9X
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
index 574e3c5359100..5fd9e5c0dd084 100644
--- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
@@ -1,54 +1,54 @@
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 165 165 0 0.2 4118.0 1.0X
-UTF8_LCASE 6996 7019 33 0.0 174899.5 42.5X
-UNICODE 5395 5407 18 0.0 134874.5 32.8X
-UNICODE_CI 5670 5672 2 0.0 141756.7 34.4X
+UTF8_BINARY 171 171 0 0.2 4273.2 1.0X
+UTF8_LCASE 7382 7393 16 0.0 184558.6 43.2X
+UNICODE 5337 5342 7 0.0 133424.6 31.2X
+UNICODE_CI 5090 5093 4 0.0 127259.3 29.8X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 306 306 0 0.1 7656.1 1.0X
-UTF8_LCASE 6950 6957 11 0.0 173739.0 22.7X
-UNICODE 5120 5123 3 0.0 128010.6 16.7X
-UNICODE_CI 5080 5099 27 0.0 127011.6 16.6X
+UTF8_BINARY 318 319 1 0.1 7952.7 1.0X
+UTF8_LCASE 7065 7072 10 0.0 176621.0 22.2X
+UNICODE 5294 5297 4 0.0 132357.7 16.6X
+UNICODE_CI 5246 5248 2 0.0 131156.9 16.5X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 384 384 1 0.1 9591.1 1.0X
-UTF8_LCASE 3549 3550 2 0.0 88721.7 9.3X
-UNICODE 14143 14145 3 0.0 353570.2 36.9X
-UNICODE_CI 11925 11929 6 0.0 298126.4 31.1X
+UTF8_BINARY 385 387 2 0.1 9635.7 1.0X
+UTF8_LCASE 3559 3560 1 0.0 88974.0 9.2X
+UNICODE 15525 15542 25 0.0 388119.0 40.3X
+UNICODE_CI 12479 12482 5 0.0 311967.1 32.4X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1375 1376 1 0.0 34375.4 1.0X
-UTF8_LCASE 8740 8744 6 0.0 218504.1 6.4X
-UNICODE 68707 68818 158 0.0 1717667.1 50.0X
-UNICODE_CI 77167 77197 42 0.0 1929168.6 56.1X
+UTF8_BINARY 1669 1671 2 0.0 41724.7 1.0X
+UTF8_LCASE 10199 10202 4 0.0 254974.8 6.1X
+UNICODE 68754 68758 6 0.0 1718852.9 41.2X
+UNICODE_CI 76056 76086 42 0.0 1901404.1 45.6X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1064 1065 2 0.0 26587.9 1.0X
-UTF8_LCASE 5820 5827 10 0.0 145506.0 5.5X
-UNICODE 67636 67675 54 0.0 1690904.3 63.6X
-UNICODE_CI 77750 77796 65 0.0 1943738.2 73.1X
+UTF8_BINARY 1785 1787 3 0.0 44623.9 1.0X
+UTF8_LCASE 6859 6861 3 0.0 171482.2 3.8X
+UNICODE 68545 68553 12 0.0 1713617.2 38.4X
+UNICODE_CI 76486 76520 48 0.0 1912155.4 42.9X
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1090 1091 0 0.0 27260.9 1.0X
-UTF8_LCASE 6049 6054 7 0.0 151221.3 5.5X
-UNICODE 74589 74633 62 0.0 1864725.7 68.4X
-UNICODE_CI 83674 83708 49 0.0 2091841.0 76.7X
+UTF8_BINARY 1760 1761 1 0.0 44004.8 1.0X
+UTF8_LCASE 6680 6681 1 0.0 166995.3 3.8X
+UNICODE 74294 74346 74 0.0 1857344.0 42.2X
+UNICODE_CI 81377 81447 98 0.0 2034434.9 46.2X
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
index d4e70f29c245b..101d8c3a614dc 100644
--- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
@@ -1,54 +1,54 @@
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
--------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 133 133 1 0.3 3317.1 1.0X
-UTF8_LCASE 7092 7097 6 0.0 177310.9 53.5X
-UNICODE 5946 5966 29 0.0 148638.1 44.8X
-UNICODE_CI 5715 5717 2 0.0 142885.1 43.1X
+UTF8_BINARY 125 128 6 0.3 3119.4 1.0X
+UTF8_LCASE 6635 6701 93 0.0 165887.1 53.2X
+UNICODE 5195 5217 31 0.0 129878.9 41.6X
+UNICODE_CI 5240 5263 33 0.0 131003.7 42.0X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
---------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 433 435 2 0.1 10816.6 1.0X
-UTF8_LCASE 7365 7369 5 0.0 184135.4 17.0X
-UNICODE 5785 5790 7 0.0 144616.9 13.4X
-UNICODE_CI 5742 5744 3 0.0 143557.1 13.3X
+UTF8_BINARY 416 419 2 0.1 10410.2 1.0X
+UTF8_LCASE 6909 6910 2 0.0 172713.2 16.6X
+UNICODE 5444 5480 50 0.0 136112.1 13.1X
+UNICODE_CI 5444 5470 37 0.0 136087.6 13.1X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 410 411 1 0.1 10246.1 1.0X
-UTF8_LCASE 3588 3589 1 0.0 89698.8 8.8X
-UNICODE 15788 15802 20 0.0 394702.8 38.5X
-UNICODE_CI 12179 12192 19 0.0 304466.6 29.7X
+UTF8_BINARY 380 386 4 0.1 9510.3 1.0X
+UTF8_LCASE 3390 3398 10 0.0 84756.7 8.9X
+UNICODE 13224 13243 27 0.0 330604.1 34.8X
+UNICODE_CI 10524 10635 157 0.0 263095.0 27.7X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1367 1370 4 0.0 34182.9 1.0X
-UTF8_LCASE 9644 9645 1 0.0 241101.2 7.1X
-UNICODE 67169 67171 3 0.0 1679230.1 49.1X
-UNICODE_CI 79077 79209 188 0.0 1976919.1 57.8X
+UTF8_BINARY 1740 1758 26 0.0 43489.5 1.0X
+UTF8_LCASE 10697 10708 15 0.0 267435.0 6.1X
+UNICODE 61284 61521 336 0.0 1532092.7 35.2X
+UNICODE_CI 70030 70051 29 0.0 1750749.7 40.3X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1064 1067 3 0.0 26608.1 1.0X
-UTF8_LCASE 6487 6491 4 0.0 162186.5 6.1X
-UNICODE 68473 68523 71 0.0 1711818.5 64.3X
-UNICODE_CI 79374 79419 64 0.0 1984338.0 74.6X
+UTF8_BINARY 1663 1675 18 0.0 41563.3 1.0X
+UTF8_LCASE 6786 6787 2 0.0 169640.2 4.1X
+UNICODE 60580 60668 124 0.0 1514498.6 36.4X
+UNICODE_CI 71018 71018 0 0.0 1775444.7 42.7X
-OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure
+OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure
AMD EPYC 7763 64-Core Processor
collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time
------------------------------------------------------------------------------------------------------------------------
-UTF8_BINARY 1002 1004 2 0.0 25061.8 1.0X
-UTF8_LCASE 6052 6052 0 0.0 151298.7 6.0X
-UNICODE 74506 74551 64 0.0 1862644.2 74.3X
-UNICODE_CI 83607 83756 211 0.0 2090164.5 83.4X
+UTF8_BINARY 1778 1779 2 0.0 44450.1 1.0X
+UTF8_LCASE 6786 6822 50 0.0 169657.6 3.8X
+UNICODE 67562 67633 101 0.0 1689054.6 38.0X
+UNICODE_CI 75378 75919 765 0.0 1884460.0 42.4X
diff --git a/sql/core/src/test/resources/collations/ICU-collations-map.md b/sql/core/src/test/resources/collations/ICU-collations-map.md
index a704034c694aa..6308571009bd8 100644
--- a/sql/core/src/test/resources/collations/ICU-collations-map.md
+++ b/sql/core/src/test/resources/collations/ICU-collations-map.md
@@ -90,55 +90,58 @@
| 85 | nl |
| 86 | nn |
| 87 | no |
-| 88 | om |
-| 89 | or |
-| 90 | pa |
-| 91 | pa_Guru |
-| 92 | pa_Guru_IND |
-| 93 | pl |
-| 94 | ps |
-| 95 | pt |
-| 96 | ro |
-| 97 | ru |
-| 98 | sa |
-| 99 | se |
-| 100 | si |
-| 101 | sk |
-| 102 | sl |
-| 103 | smn |
-| 104 | sq |
-| 105 | sr |
-| 106 | sr_Cyrl |
-| 107 | sr_Cyrl_BIH |
-| 108 | sr_Cyrl_MNE |
-| 109 | sr_Cyrl_SRB |
-| 110 | sr_Latn |
-| 111 | sr_Latn_BIH |
-| 112 | sr_Latn_SRB |
-| 113 | sv |
-| 114 | sw |
-| 115 | ta |
-| 116 | te |
-| 117 | th |
-| 118 | tk |
-| 119 | to |
-| 120 | tr |
-| 121 | ug |
-| 122 | uk |
-| 123 | ur |
-| 124 | uz |
-| 125 | vi |
-| 126 | wae |
-| 127 | wo |
-| 128 | xh |
-| 129 | yi |
-| 130 | yo |
-| 131 | zh |
-| 132 | zh_Hans |
-| 133 | zh_Hans_CHN |
-| 134 | zh_Hans_SGP |
-| 135 | zh_Hant |
-| 136 | zh_Hant_HKG |
-| 137 | zh_Hant_MAC |
-| 138 | zh_Hant_TWN |
-| 139 | zu |
+| 88 | nso |
+| 89 | om |
+| 90 | or |
+| 91 | pa |
+| 92 | pa_Guru |
+| 93 | pa_Guru_IND |
+| 94 | pl |
+| 95 | ps |
+| 96 | pt |
+| 97 | ro |
+| 98 | ru |
+| 99 | sa |
+| 100 | se |
+| 101 | si |
+| 102 | sk |
+| 103 | sl |
+| 104 | smn |
+| 105 | sq |
+| 106 | sr |
+| 107 | sr_Cyrl |
+| 108 | sr_Cyrl_BIH |
+| 109 | sr_Cyrl_MNE |
+| 110 | sr_Cyrl_SRB |
+| 111 | sr_Latn |
+| 112 | sr_Latn_BIH |
+| 113 | sr_Latn_SRB |
+| 114 | st |
+| 115 | sv |
+| 116 | sw |
+| 117 | ta |
+| 118 | te |
+| 119 | th |
+| 120 | tk |
+| 121 | tn |
+| 122 | to |
+| 123 | tr |
+| 124 | ug |
+| 125 | uk |
+| 126 | ur |
+| 127 | uz |
+| 128 | vi |
+| 129 | wae |
+| 130 | wo |
+| 131 | xh |
+| 132 | yi |
+| 133 | yo |
+| 134 | zh |
+| 135 | zh_Hans |
+| 136 | zh_Hans_CHN |
+| 137 | zh_Hans_SGP |
+| 138 | zh_Hant |
+| 139 | zh_Hant_HKG |
+| 140 | zh_Hant_MAC |
+| 141 | zh_Hant_TWN |
+| 142 | zu |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
index adfc5b703da47..3563e04dced11 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
@@ -200,10 +200,10 @@ class CollationSQLExpressionsSuite
Murmur3HashTestCase("Spark ", "UTF8_BINARY_RTRIM", 1779328737),
Murmur3HashTestCase("Spark", "UTF8_LCASE", -1928694360),
Murmur3HashTestCase("Spark ", "UTF8_LCASE_RTRIM", -1928694360),
- Murmur3HashTestCase("SQL", "UNICODE", -1923567940),
- Murmur3HashTestCase("SQL ", "UNICODE_RTRIM", -1923567940),
- Murmur3HashTestCase("SQL", "UNICODE_CI", 1029527950),
- Murmur3HashTestCase("SQL ", "UNICODE_CI_RTRIM", 1029527950)
+ Murmur3HashTestCase("SQL", "UNICODE", 1483684981),
+ Murmur3HashTestCase("SQL ", "UNICODE_RTRIM", 1483684981),
+ Murmur3HashTestCase("SQL", "UNICODE_CI", 279787709),
+ Murmur3HashTestCase("SQL ", "UNICODE_CI_RTRIM", 279787709)
)
// Supported collations
@@ -232,10 +232,10 @@ class CollationSQLExpressionsSuite
XxHash64TestCase("Spark ", "UTF8_BINARY_RTRIM", 6480371823304753502L),
XxHash64TestCase("Spark", "UTF8_LCASE", -3142112654825786434L),
XxHash64TestCase("Spark ", "UTF8_LCASE_RTRIM", -3142112654825786434L),
- XxHash64TestCase("SQL", "UNICODE", 5964849564945649886L),
- XxHash64TestCase("SQL ", "UNICODE_RTRIM", 5964849564945649886L),
- XxHash64TestCase("SQL", "UNICODE_CI", 3732497619779520590L),
- XxHash64TestCase("SQL ", "UNICODE_CI_RTRIM", 3732497619779520590L)
+ XxHash64TestCase("SQL", "UNICODE", 7549349329256749019L),
+ XxHash64TestCase("SQL ", "UNICODE_RTRIM", 7549349329256749019L),
+ XxHash64TestCase("SQL", "UNICODE_CI", -3010409544364398863L),
+ XxHash64TestCase("SQL ", "UNICODE_CI_RTRIM", -3010409544364398863L)
)
// Supported collations
@@ -3147,7 +3147,7 @@ class CollationSQLExpressionsSuite
HyperLogLogPlusPlusTestCase("utf8_lcase", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA",
"aA", "Aa", "aa"), Seq(Row(5))),
HyperLogLogPlusPlusTestCase("UNICODE", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA",
- "aA", "Aa", "aa"), Seq(Row(10))),
+ "aA", "Aa", "aa"), Seq(Row(9))),
HyperLogLogPlusPlusTestCase("UNICODE_CI", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA",
"aA", "Aa", "aa"), Seq(Row(5)))
)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
index 3a5a650e5a0c4..d69ba77a14750 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
@@ -1979,27 +1979,28 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
// verify that the output ordering is as expected (UTF8_BINARY, UTF8_LCASE, etc.)
val df = sql("SELECT * FROM collations() limit 10")
+ val icvVersion = "76.1.0.0"
checkAnswer(df,
Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%UTF8_BINARY%'"),
Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
@@ -2007,34 +2008,34 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%zh_Hant_HKG%'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR China",
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR China",
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI", "Chinese", "Hong Kong SAR China",
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE COUNTRY = 'Singapore'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hans_SGP", "Chinese", "Singapore",
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI", "Chinese", "Singapore",
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI", "Chinese", "Singapore",
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI", "Chinese", "Singapore",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
checkAnswer(sql("SELECT * FROM collations() WHERE LANGUAGE = 'English' " +
"and COUNTRY = 'United States'"),
Seq(Row("SYSTEM", "BUILTIN", "en_USA", "English", "United States",
- "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_AI", "English", "United States",
- "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_CI", "English", "United States",
- "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
+ "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion),
Row("SYSTEM", "BUILTIN", "en_USA_CI_AI", "English", "United States",
- "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))
+ "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion)))
checkAnswer(sql("SELECT NAME, LANGUAGE, ACCENT_SENSITIVITY, CASE_SENSITIVITY " +
"FROM collations() WHERE COUNTRY = 'United States'"),