diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index 0e94073e4773c..1f64547da7415 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -33,7 +33,7 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8} class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite - val currentIcuVersion: String = "75.1" + val currentIcuVersion: String = "76.1" test("collationId stability") { assert(INDETERMINATE_COLLATION_ID == -1) diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 7723d42423d6d..d75760967ba11 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -100,7 +100,7 @@ hk2-locator/3.0.6//hk2-locator-3.0.6.jar hk2-utils/3.0.6//hk2-utils-3.0.6.jar httpclient/4.5.14//httpclient-4.5.14.jar httpcore/4.4.16//httpcore-4.4.16.jar -icu4j/75.1//icu4j-75.1.jar +icu4j/76.1//icu4j-76.1.jar ini4j/0.5.4//ini4j-0.5.4.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.5.2//ivy-2.5.2.jar diff --git a/pom.xml b/pom.xml index ea139ecfe6939..de9fb94244997 100644 --- a/pom.xml +++ b/pom.xml @@ -217,7 +217,7 @@ 6.1.1 4.1.110.Final 2.0.66.Final - 75.1 + 76.1 5.11.0 1.11.0 0.13.0 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala index a30f604550a38..77a3d6df69221 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationExpressionSuite.scala @@ -168,6 
+168,8 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("CollationKey generates correct collation key for collated string") { + // Key byte ICU emits per 'a'/'A' below: 0x2A (42) in ICU 75.1, 0x2B (43) in ICU 76.1 + val b: Byte = 0x2B val testCases = Seq( ("", "UTF8_BINARY", UTF8String.fromString("").getBytes), ("aa", "UTF8_BINARY", UTF8String.fromString("aa").getBytes), @@ -180,15 +182,15 @@ class CollationExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { (" AA ", "UTF8_LCASE_RTRIM", UTF8String.fromString(" aa").getBytes), ("aA", "UTF8_LCASE", UTF8String.fromString("aa").getBytes), ("", "UNICODE", Array[Byte](1, 1, 0)), - ("aa", "UNICODE", Array[Byte](42, 42, 1, 6, 1, 6, 0)), - ("AA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -36, -36, 0)), - ("aA", "UNICODE", Array[Byte](42, 42, 1, 6, 1, -59, -36, 0)), - ("aa ", "UNICODE_RTRIM", Array[Byte](42, 42, 1, 6, 1, 6, 0)), + ("aa", "UNICODE", Array[Byte](b, b, 1, 6, 1, 6, 0)), + ("AA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -36, -36, 0)), + ("aA", "UNICODE", Array[Byte](b, b, 1, 6, 1, -59, -36, 0)), + ("aa ", "UNICODE_RTRIM", Array[Byte](b, b, 1, 6, 1, 6, 0)), ("", "UNICODE_CI", Array[Byte](1, 0)), - ("aa", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), - ("aa ", "UNICODE_CI_RTRIM", Array[Byte](42, 42, 1, 6, 0)), - ("AA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)), - ("aA", "UNICODE_CI", Array[Byte](42, 42, 1, 6, 0)) + ("aa", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)), + ("aa ", "UNICODE_CI_RTRIM", Array[Byte](b, b, 1, 6, 0)), + ("AA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)), + ("aA", "UNICODE_CI", Array[Byte](b, b, 1, 6, 0)) ) for ((input, collation, expected) <- testCases) { val str = Literal.create(input, StringType(collation)) diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index b2df218c8fbb4..da06b0209d0a6 100644 --- 
a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -1,54 +1,54 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1353 1353 1 0.1 13526.6 1.0X -UTF8_LCASE 2703 2705 3 0.0 27032.4 2.0X -UNICODE 16848 16894 65 0.0 168482.9 12.5X -UNICODE_CI 16362 16367 8 0.0 163615.6 12.1X +UTF8_BINARY 1349 1349 0 0.1 13485.4 1.0X +UTF8_LCASE 3559 3561 3 0.0 35594.3 2.6X +UNICODE 17580 17589 12 0.0 175803.6 13.0X +UNICODE_CI 17210 17212 2 0.0 172100.2 12.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 2640 2642 3 0.0 26401.5 1.0X -UTF8_LCASE 3616 3618 2 0.0 36164.8 1.4X -UNICODE 17465 17470 7 0.0 174650.9 6.6X -UNICODE_CI 17251 17264 18 0.0 172510.9 6.5X +UTF8_BINARY 1740 1741 1 0.1 17398.8 1.0X +UTF8_LCASE 2630 2632 3 0.0 26301.0 1.5X +UNICODE 16732 16743 16 0.0 167319.7 9.6X +UNICODE_CI 16482 16492 14 0.0 164819.7 9.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 2843 2844 1 0.0 28427.2 1.0X -UTF8_LCASE 5417 5437 28 0.0 54170.7 1.9X -UNICODE 68601 68619 26 0.0 686010.8 24.1X -UNICODE_CI 56342 56361 26 0.0 563422.2 19.8X +UTF8_BINARY 2808 2808 0 0.0 28082.3 1.0X +UTF8_LCASE 5412 5413 1 0.0 54123.5 1.9X +UNICODE 70755 70787 44 0.0 707553.4 25.2X +UNICODE_CI 57639 57669 43 0.0 576390.0 20.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 7674 7674 1 0.0 76735.3 1.0X -UTF8_LCASE 20367 20376 14 0.0 203665.1 2.7X -UNICODE 377133 377909 1098 0.0 3771328.8 49.1X -UNICODE_CI 434710 435099 551 0.0 4347097.2 56.7X +UTF8_BINARY 9356 9357 0 0.0 93564.9 1.0X +UTF8_LCASE 24106 24129 33 0.0 241055.3 2.6X +UNICODE 368428 369053 883 0.0 3684284.1 39.4X +UNICODE_CI 417361 418242 1246 0.0 4173613.9 44.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 6956 6959 4 0.0 69561.7 1.0X -UTF8_LCASE 14246 14262 23 0.0 142459.6 2.0X -UNICODE 369940 370072 186 0.0 3699400.9 53.2X -UNICODE_CI 442072 442365 414 0.0 4420718.1 63.6X +UTF8_BINARY 10941 10943 2 0.0 109411.5 1.0X +UTF8_LCASE 20041 20058 24 0.0 200410.1 1.8X +UNICODE 364296 365610 1859 0.0 3642958.8 33.3X +UNICODE_CI 424306 424888 823 0.0 4243062.7 38.8X 
-OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 6927 6927 0 0.0 69265.2 1.0X -UTF8_LCASE 15505 15514 12 0.0 155054.5 2.2X -UNICODE 382361 382426 93 0.0 3823606.6 55.2X -UNICODE_CI 449956 450063 151 0.0 4499562.9 65.0X +UTF8_BINARY 10551 10556 7 0.0 105511.7 1.0X +UTF8_LCASE 20294 20300 9 0.0 202943.7 1.9X +UNICODE 384070 384554 684 0.0 3840704.6 36.4X +UNICODE_CI 441935 442184 352 0.0 4419351.4 41.9X diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index a63b80f005ed0..c79fef5f4cf65 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -1,54 +1,54 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1372 1372 1 0.1 13718.5 1.0X -UTF8_LCASE 3115 3116 1 0.0 31154.4 2.3X -UNICODE 19813 19820 9 0.0 198132.2 14.4X -UNICODE_CI 19669 19686 24 0.0 196694.2 14.3X +UTF8_BINARY 1372 1372 1 0.1 13715.2 1.0X +UTF8_LCASE 3847 3851 6 0.0 38467.3 2.8X +UNICODE 19659 19662 4 0.0 196587.1 14.3X +UNICODE_CI 19663 19666 3 0.0 196634.5 14.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - 
compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 1727 1728 1 0.1 17271.3 1.0X -UTF8_LCASE 3034 3035 1 0.0 30337.2 1.8X -UNICODE 19230 19243 18 0.0 192301.2 11.1X -UNICODE_CI 19080 19082 3 0.0 190802.0 11.0X +UTF8_BINARY 1706 1707 3 0.1 17056.0 1.0X +UTF8_LCASE 4016 4016 0 0.0 40164.0 2.4X +UNICODE 19545 19547 3 0.0 195453.4 11.5X +UNICODE_CI 19544 19547 5 0.0 195437.5 11.5X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 3080 3080 0 0.0 30796.4 1.0X -UTF8_LCASE 6436 6454 25 0.0 64360.0 2.1X -UNICODE 68095 68167 101 0.0 680951.3 22.1X -UNICODE_CI 62122 62123 2 0.0 621215.8 20.2X +UTF8_BINARY 3091 3092 1 0.0 30909.8 1.0X +UTF8_LCASE 6286 6287 2 0.0 62856.0 2.0X +UNICODE 65495 65528 47 0.0 654945.7 21.2X +UNICODE_CI 59987 59994 10 0.0 599868.6 19.4X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 8260 8261 1 0.0 82604.0 1.0X -UTF8_LCASE 23629 23629 0 0.0 236286.4 2.9X -UNICODE 364843 366078 1747 0.0 3648427.9 44.2X -UNICODE_CI 425728 426449 1020 0.0 4257275.1 51.5X +UTF8_BINARY 13707 13726 27 0.0 137069.4 1.0X +UTF8_LCASE 28660 28685 36 0.0 286598.9 2.1X +UNICODE 363134 364168 
1462 0.0 3631341.3 26.5X +UNICODE_CI 412158 412229 100 0.0 4121577.8 30.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 6844 6848 5 0.0 68440.4 1.0X -UTF8_LCASE 21849 21870 30 0.0 218486.3 3.2X -UNICODE 363474 363811 476 0.0 3634738.4 53.1X -UNICODE_CI 427563 428029 659 0.0 4275629.8 62.5X +UTF8_BINARY 12200 12205 8 0.0 121998.8 1.0X +UTF8_LCASE 27626 27633 9 0.0 276263.6 2.3X +UNICODE 350755 351083 464 0.0 3507553.8 28.8X +UNICODE_CI 409383 410380 1410 0.0 4093834.8 33.6X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 6904 6907 4 0.0 69039.3 1.0X -UTF8_LCASE 22007 22009 3 0.0 220067.8 3.2X -UNICODE 376402 377858 2060 0.0 3764015.4 54.5X -UNICODE_CI 444485 444809 458 0.0 4444850.8 64.4X +UTF8_BINARY 11879 11887 12 0.0 118786.3 1.0X +UTF8_LCASE 27743 27759 22 0.0 277434.4 2.3X +UNICODE 368435 368478 61 0.0 3684351.2 31.0X +UNICODE_CI 426350 426503 216 0.0 4263497.6 35.9X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt index 574e3c5359100..5fd9e5c0dd084 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -1,54 +1,54 @@ -OpenJDK 64-Bit Server VM 21.0.4+7-LTS 
on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 165 165 0 0.2 4118.0 1.0X -UTF8_LCASE 6996 7019 33 0.0 174899.5 42.5X -UNICODE 5395 5407 18 0.0 134874.5 32.8X -UNICODE_CI 5670 5672 2 0.0 141756.7 34.4X +UTF8_BINARY 171 171 0 0.2 4273.2 1.0X +UTF8_LCASE 7382 7393 16 0.0 184558.6 43.2X +UNICODE 5337 5342 7 0.0 133424.6 31.2X +UNICODE_CI 5090 5093 4 0.0 127259.3 29.8X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 306 306 0 0.1 7656.1 1.0X -UTF8_LCASE 6950 6957 11 0.0 173739.0 22.7X -UNICODE 5120 5123 3 0.0 128010.6 16.7X -UNICODE_CI 5080 5099 27 0.0 127011.6 16.6X +UTF8_BINARY 318 319 1 0.1 7952.7 1.0X +UTF8_LCASE 7065 7072 10 0.0 176621.0 22.2X +UNICODE 5294 5297 4 0.0 132357.7 16.6X +UNICODE_CI 5246 5248 2 0.0 131156.9 16.5X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 384 384 1 0.1 9591.1 1.0X -UTF8_LCASE 3549 3550 2 0.0 88721.7 9.3X -UNICODE 14143 14145 3 0.0 353570.2 36.9X -UNICODE_CI 11925 11929 6 0.0 298126.4 
31.1X +UTF8_BINARY 385 387 2 0.1 9635.7 1.0X +UTF8_LCASE 3559 3560 1 0.0 88974.0 9.2X +UNICODE 15525 15542 25 0.0 388119.0 40.3X +UNICODE_CI 12479 12482 5 0.0 311967.1 32.4X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1375 1376 1 0.0 34375.4 1.0X -UTF8_LCASE 8740 8744 6 0.0 218504.1 6.4X -UNICODE 68707 68818 158 0.0 1717667.1 50.0X -UNICODE_CI 77167 77197 42 0.0 1929168.6 56.1X +UTF8_BINARY 1669 1671 2 0.0 41724.7 1.0X +UTF8_LCASE 10199 10202 4 0.0 254974.8 6.1X +UNICODE 68754 68758 6 0.0 1718852.9 41.2X +UNICODE_CI 76056 76086 42 0.0 1901404.1 45.6X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1064 1065 2 0.0 26587.9 1.0X -UTF8_LCASE 5820 5827 10 0.0 145506.0 5.5X -UNICODE 67636 67675 54 0.0 1690904.3 63.6X -UNICODE_CI 77750 77796 65 0.0 1943738.2 73.1X +UTF8_BINARY 1785 1787 3 0.0 44623.9 1.0X +UTF8_LCASE 6859 6861 3 0.0 171482.2 3.8X +UNICODE 68545 68553 12 0.0 1713617.2 38.4X +UNICODE_CI 76486 76520 48 0.0 1912155.4 42.9X -OpenJDK 64-Bit Server VM 21.0.4+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 21.0.5+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time 
------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1090 1091 0 0.0 27260.9 1.0X -UTF8_LCASE 6049 6054 7 0.0 151221.3 5.5X -UNICODE 74589 74633 62 0.0 1864725.7 68.4X -UNICODE_CI 83674 83708 49 0.0 2091841.0 76.7X +UTF8_BINARY 1760 1761 1 0.0 44004.8 1.0X +UTF8_LCASE 6680 6681 1 0.0 166995.3 3.8X +UNICODE 74294 74346 74 0.0 1857344.0 42.2X +UNICODE_CI 81377 81447 98 0.0 2034434.9 46.2X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt index d4e70f29c245b..101d8c3a614dc 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -1,54 +1,54 @@ -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 133 133 1 0.3 3317.1 1.0X -UTF8_LCASE 7092 7097 6 0.0 177310.9 53.5X -UNICODE 5946 5966 29 0.0 148638.1 44.8X -UNICODE_CI 5715 5717 2 0.0 142885.1 43.1X +UTF8_BINARY 125 128 6 0.3 3119.4 1.0X +UTF8_LCASE 6635 6701 93 0.0 165887.1 53.2X +UNICODE 5195 5217 31 0.0 129878.9 41.6X +UNICODE_CI 5240 5263 33 0.0 131003.7 42.0X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY 433 435 2 0.1 10816.6 1.0X -UTF8_LCASE 7365 
7369 5 0.0 184135.4 17.0X -UNICODE 5785 5790 7 0.0 144616.9 13.4X -UNICODE_CI 5742 5744 3 0.0 143557.1 13.3X +UTF8_BINARY 416 419 2 0.1 10410.2 1.0X +UTF8_LCASE 6909 6910 2 0.0 172713.2 16.6X +UNICODE 5444 5480 50 0.0 136112.1 13.1X +UNICODE_CI 5444 5470 37 0.0 136087.6 13.1X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 410 411 1 0.1 10246.1 1.0X -UTF8_LCASE 3588 3589 1 0.0 89698.8 8.8X -UNICODE 15788 15802 20 0.0 394702.8 38.5X -UNICODE_CI 12179 12192 19 0.0 304466.6 29.7X +UTF8_BINARY 380 386 4 0.1 9510.3 1.0X +UTF8_LCASE 3390 3398 10 0.0 84756.7 8.9X +UNICODE 13224 13243 27 0.0 330604.1 34.8X +UNICODE_CI 10524 10635 157 0.0 263095.0 27.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1367 1370 4 0.0 34182.9 1.0X -UTF8_LCASE 9644 9645 1 0.0 241101.2 7.1X -UNICODE 67169 67171 3 0.0 1679230.1 49.1X -UNICODE_CI 79077 79209 188 0.0 1976919.1 57.8X +UTF8_BINARY 1740 1758 26 0.0 43489.5 1.0X +UTF8_LCASE 10697 10708 15 0.0 267435.0 6.1X +UNICODE 61284 61521 336 0.0 1532092.7 35.2X +UNICODE_CI 70030 70051 29 0.0 1750749.7 40.3X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - startsWith: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1064 1067 3 0.0 26608.1 1.0X -UTF8_LCASE 6487 6491 4 0.0 162186.5 6.1X -UNICODE 68473 68523 71 0.0 1711818.5 64.3X -UNICODE_CI 79374 79419 64 0.0 1984338.0 74.6X +UTF8_BINARY 1663 1675 18 0.0 41563.3 1.0X +UTF8_LCASE 6786 6787 2 0.0 169640.2 4.1X +UNICODE 60580 60668 124 0.0 1514498.6 36.4X +UNICODE_CI 71018 71018 0 0.0 1775444.7 42.7X -OpenJDK 64-Bit Server VM 17.0.12+7-LTS on Linux 6.5.0-1025-azure +OpenJDK 64-Bit Server VM 17.0.13+11-LTS on Linux 6.5.0-1025-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative time ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY 1002 1004 2 0.0 25061.8 1.0X -UTF8_LCASE 6052 6052 0 0.0 151298.7 6.0X -UNICODE 74506 74551 64 0.0 1862644.2 74.3X -UNICODE_CI 83607 83756 211 0.0 2090164.5 83.4X +UTF8_BINARY 1778 1779 2 0.0 44450.1 1.0X +UTF8_LCASE 6786 6822 50 0.0 169657.6 3.8X +UNICODE 67562 67633 101 0.0 1689054.6 38.0X +UNICODE_CI 75378 75919 765 0.0 1884460.0 42.4X diff --git a/sql/core/src/test/resources/collations/ICU-collations-map.md b/sql/core/src/test/resources/collations/ICU-collations-map.md index a704034c694aa..6308571009bd8 100644 --- a/sql/core/src/test/resources/collations/ICU-collations-map.md +++ b/sql/core/src/test/resources/collations/ICU-collations-map.md @@ -90,55 +90,58 @@ | 85 | nl | | 86 | nn | | 87 | no | -| 88 | om | -| 89 | or | -| 90 | pa | -| 91 | pa_Guru | -| 92 | pa_Guru_IND | -| 93 | pl | -| 94 | ps | -| 95 | pt | -| 96 | ro | -| 97 | ru | -| 98 | sa | -| 99 | se | -| 100 | si | -| 101 | sk | -| 102 | sl | -| 103 | smn | -| 104 | sq | -| 105 | sr | -| 106 | sr_Cyrl | -| 107 | sr_Cyrl_BIH | -| 108 | sr_Cyrl_MNE | -| 109 | sr_Cyrl_SRB | -| 110 | 
sr_Latn | -| 111 | sr_Latn_BIH | -| 112 | sr_Latn_SRB | -| 113 | sv | -| 114 | sw | -| 115 | ta | -| 116 | te | -| 117 | th | -| 118 | tk | -| 119 | to | -| 120 | tr | -| 121 | ug | -| 122 | uk | -| 123 | ur | -| 124 | uz | -| 125 | vi | -| 126 | wae | -| 127 | wo | -| 128 | xh | -| 129 | yi | -| 130 | yo | -| 131 | zh | -| 132 | zh_Hans | -| 133 | zh_Hans_CHN | -| 134 | zh_Hans_SGP | -| 135 | zh_Hant | -| 136 | zh_Hant_HKG | -| 137 | zh_Hant_MAC | -| 138 | zh_Hant_TWN | -| 139 | zu | +| 88 | nso | +| 89 | om | +| 90 | or | +| 91 | pa | +| 92 | pa_Guru | +| 93 | pa_Guru_IND | +| 94 | pl | +| 95 | ps | +| 96 | pt | +| 97 | ro | +| 98 | ru | +| 99 | sa | +| 100 | se | +| 101 | si | +| 102 | sk | +| 103 | sl | +| 104 | smn | +| 105 | sq | +| 106 | sr | +| 107 | sr_Cyrl | +| 108 | sr_Cyrl_BIH | +| 109 | sr_Cyrl_MNE | +| 110 | sr_Cyrl_SRB | +| 111 | sr_Latn | +| 112 | sr_Latn_BIH | +| 113 | sr_Latn_SRB | +| 114 | st | +| 115 | sv | +| 116 | sw | +| 117 | ta | +| 118 | te | +| 119 | th | +| 120 | tk | +| 121 | tn | +| 122 | to | +| 123 | tr | +| 124 | ug | +| 125 | uk | +| 126 | ur | +| 127 | uz | +| 128 | vi | +| 129 | wae | +| 130 | wo | +| 131 | xh | +| 132 | yi | +| 133 | yo | +| 134 | zh | +| 135 | zh_Hans | +| 136 | zh_Hans_CHN | +| 137 | zh_Hans_SGP | +| 138 | zh_Hant | +| 139 | zh_Hant_HKG | +| 140 | zh_Hant_MAC | +| 141 | zh_Hant_TWN | +| 142 | zu | diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index adfc5b703da47..3563e04dced11 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -200,10 +200,10 @@ class CollationSQLExpressionsSuite Murmur3HashTestCase("Spark ", "UTF8_BINARY_RTRIM", 1779328737), Murmur3HashTestCase("Spark", "UTF8_LCASE", -1928694360), Murmur3HashTestCase("Spark ", "UTF8_LCASE_RTRIM", 
-1928694360), - Murmur3HashTestCase("SQL", "UNICODE", -1923567940), - Murmur3HashTestCase("SQL ", "UNICODE_RTRIM", -1923567940), - Murmur3HashTestCase("SQL", "UNICODE_CI", 1029527950), - Murmur3HashTestCase("SQL ", "UNICODE_CI_RTRIM", 1029527950) + Murmur3HashTestCase("SQL", "UNICODE", 1483684981), + Murmur3HashTestCase("SQL ", "UNICODE_RTRIM", 1483684981), + Murmur3HashTestCase("SQL", "UNICODE_CI", 279787709), + Murmur3HashTestCase("SQL ", "UNICODE_CI_RTRIM", 279787709) ) // Supported collations @@ -232,10 +232,10 @@ class CollationSQLExpressionsSuite XxHash64TestCase("Spark ", "UTF8_BINARY_RTRIM", 6480371823304753502L), XxHash64TestCase("Spark", "UTF8_LCASE", -3142112654825786434L), XxHash64TestCase("Spark ", "UTF8_LCASE_RTRIM", -3142112654825786434L), - XxHash64TestCase("SQL", "UNICODE", 5964849564945649886L), - XxHash64TestCase("SQL ", "UNICODE_RTRIM", 5964849564945649886L), - XxHash64TestCase("SQL", "UNICODE_CI", 3732497619779520590L), - XxHash64TestCase("SQL ", "UNICODE_CI_RTRIM", 3732497619779520590L) + XxHash64TestCase("SQL", "UNICODE", 7549349329256749019L), + XxHash64TestCase("SQL ", "UNICODE_RTRIM", 7549349329256749019L), + XxHash64TestCase("SQL", "UNICODE_CI", -3010409544364398863L), + XxHash64TestCase("SQL ", "UNICODE_CI_RTRIM", -3010409544364398863L) ) // Supported collations @@ -3147,7 +3147,7 @@ class CollationSQLExpressionsSuite HyperLogLogPlusPlusTestCase("utf8_lcase", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", "aA", "Aa", "aa"), Seq(Row(5))), HyperLogLogPlusPlusTestCase("UNICODE", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", - "aA", "Aa", "aa"), Seq(Row(10))), + "aA", "Aa", "aa"), Seq(Row(9))), HyperLogLogPlusPlusTestCase("UNICODE_CI", Seq("a", "a", "A", "z", "zz", "ZZ", "w", "AA", "aA", "Aa", "aa"), Seq(Row(5))) ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala index 3a5a650e5a0c4..d69ba77a14750 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala @@ -1979,27 +1979,28 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { // verify that the output ordering is as expected (UTF8_BINARY, UTF8_LCASE, etc.) val df = sql("SELECT * FROM collations() limit 10") + val icvVersion = "76.1.0.0" checkAnswer(df, Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null, "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null), Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null, "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null), Row("SYSTEM", "BUILTIN", "UNICODE", "", "", - "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "", - "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "", - "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "", - "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "", - "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "", - "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "", - "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "", - "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", 
"75.1.0.0"))) + "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion))) checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%UTF8_BINARY%'"), Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null, @@ -2007,34 +2008,34 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper { checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%zh_Hant_HKG%'"), Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR China", - "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR China", - "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI", "Chinese", "Hong Kong SAR China", - "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China", - "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"))) + "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion))) checkAnswer(sql("SELECT * FROM collations() WHERE COUNTRY = 'Singapore'"), Seq(Row("SYSTEM", "BUILTIN", "zh_Hans_SGP", "Chinese", "Singapore", - "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI", "Chinese", "Singapore", - "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI", "Chinese", "Singapore", - "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI", "Chinese", "Singapore", - "ACCENT_INSENSITIVE", 
"CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"))) + "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion))) checkAnswer(sql("SELECT * FROM collations() WHERE LANGUAGE = 'English' " + "and COUNTRY = 'United States'"), Seq(Row("SYSTEM", "BUILTIN", "en_USA", "English", "United States", - "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "en_USA_AI", "English", "United States", - "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "en_USA_CI", "English", "United States", - "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"), + "ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion), Row("SYSTEM", "BUILTIN", "en_USA_CI_AI", "English", "United States", - "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"))) + "ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", icvVersion))) checkAnswer(sql("SELECT NAME, LANGUAGE, ACCENT_SENSITIVITY, CASE_SENSITIVITY " + "FROM collations() WHERE COUNTRY = 'United States'"),