
Commit

Merge remote-tracking branch 'origin/branch-24.12' into db-14.3-string-test

mythrocks committed Nov 4, 2024
2 parents 7807452 + 2e16ff2, commit e7732c5
Showing 54 changed files with 784 additions and 1,039 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,5 @@
# Change log
Generated on 2024-10-18
Generated on 2024-10-31

## Release 24.10

@@ -26,6 +26,7 @@ Generated on 2024-10-18
### Bugs Fixed
|||
|:---|:---|
|[#11558](https://github.com/NVIDIA/spark-rapids/issues/11558)|[BUG] test_sortmerge_join_ridealong fails on DB 13.3|
|[#11573](https://github.com/NVIDIA/spark-rapids/issues/11573)|[BUG] very long tail task is observed when many tasks are contending for PrioritySemaphore|
|[#11367](https://github.com/NVIDIA/spark-rapids/issues/11367)|[BUG] Error "table_view.cpp:36: Column size mismatch" when using approx_percentile on a string column|
|[#11543](https://github.com/NVIDIA/spark-rapids/issues/11543)|[BUG] test_yyyyMMdd_format_for_legacy_mode[DATAGEN_SEED=1727619674, TZ=UTC] failed GPU and CPU are not both null|
@@ -68,6 +69,8 @@ Generated on 2024-10-18
### PRs
|||
|:---|:---|
|[#11676](https://github.com/NVIDIA/spark-rapids/pull/11676)| Fix race condition with Parquet filter pushdown modifying shared hadoop Configuration|
|[#11626](https://github.com/NVIDIA/spark-rapids/pull/11626)|Update latest changelog [skip ci]|
|[#11624](https://github.com/NVIDIA/spark-rapids/pull/11624)|Update the download link [skip ci]|
|[#11577](https://github.com/NVIDIA/spark-rapids/pull/11577)|Update latest changelog [skip ci]|
|[#11576](https://github.com/NVIDIA/spark-rapids/pull/11576)|Update rapids JNI and private dependency to 24.10.0|
44 changes: 33 additions & 11 deletions build/shimplify.py
@@ -84,6 +84,7 @@
import os
import re
import subprocess
from functools import partial


def __project():
@@ -199,7 +200,9 @@ def __csv_as_arr(str_val):
__shim_comment_pattern = re.compile(re.escape(__opening_shim_tag) +
r'\n(.*)\n' +
re.escape(__closing_shim_tag), re.DOTALL)

__spark_version_classifier = '$_spark.version.classifier_'
__spark_version_placeholder = re.escape(__spark_version_classifier)
__package_pattern = re.compile('package .*' + '(' + __spark_version_placeholder + ')')
def __upsert_shim_json(filename, bv_list):
with open(filename, 'r') as file:
contents = file.readlines()
@@ -365,10 +368,7 @@ def __generate_symlinks():
__log.info("# generating symlinks for shim %s %s files", buildver, src_type)
__traverse_source_tree_of_all_shims(
src_type,
lambda src_type, path, build_ver_arr: __generate_symlink_to_file(buildver,
src_type,
path,
build_ver_arr))
partial(__generate_symlink_to_file, buildver=buildver, src_type=src_type))

def __traverse_source_tree_of_all_shims(src_type, func):
"""Walks src/<src_type>/sparkXYZ"""
@@ -392,11 +392,10 @@ def __traverse_source_tree_of_all_shims(src_type, func):
build_ver_arr = map(lambda x: str(json.loads(x).get('spark')), shim_arr)
__log.debug("extracted shims %s", build_ver_arr)
assert build_ver_arr == sorted(build_ver_arr),\
"%s shim list is not properly sorted" % shim_file_path
func(src_type, shim_file_path, build_ver_arr)

"%s shim list is not properly sorted: %s" % (shim_file_path, build_ver_arr)
func(shim_file_path=shim_file_path, build_ver_arr=build_ver_arr, shim_file_txt=shim_file_txt)

def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr):
def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr, shim_file_txt):
if buildver in build_ver_arr:
project_base_dir = str(__project().getBaseDir())
base_dir = __src_basedir
@@ -416,9 +415,32 @@ def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr
target_shim_file_path = os.path.join(target_root, target_rel_path)
__log.debug("creating symlink %s -> %s", target_shim_file_path, shim_file_path)
__makedirs(os.path.dirname(target_shim_file_path))
if __should_overwrite:
package_match = __package_pattern.search(shim_file_txt)
if __should_overwrite or package_match:
__remove_file(target_shim_file_path)
__symlink(shim_file_path, target_shim_file_path)
if package_match:
with open(target_shim_file_path, mode='w') as f:
f.write(shim_file_txt[0:package_match.start(1)])
f.write("spark")
f.write(buildver)
f.write('\n')
f.write('''
/*
!!! DO NOT EDIT THIS FILE !!!
This file has been generated from the original
%s
by interpolating $_spark.version.classifier_=%s
Be sure to edit the original file if required
*/
''' % (shim_file_path, 'spark' + buildver))
f.write(shim_file_txt[package_match.end(1):])
else:
__symlink(shim_file_path, target_shim_file_path)


def __symlink(src, target):
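To make the new logic above concrete: when a shim source file's package declaration contains the `$_spark.version.classifier_` placeholder, shimplify now materializes a copy with `spark<buildver>` interpolated into the package name instead of symlinking the file. Below is a minimal standalone sketch of just that substitution; the helper name `interpolate_classifier` is illustrative, and the generated-file banner written by the real code is omitted.

```python
import re

# Same placeholder and package pattern as the diff above.
SPARK_VERSION_CLASSIFIER = '$_spark.version.classifier_'
PACKAGE_PATTERN = re.compile('package .*' + '(' + re.escape(SPARK_VERSION_CLASSIFIER) + ')')

def interpolate_classifier(shim_file_txt, buildver):
    """Return the file text with the placeholder in the package declaration
    replaced by spark<buildver>, or None when the file has no template package."""
    match = PACKAGE_PATTERN.search(shim_file_txt)
    if match is None:
        return None  # no placeholder: shimplify would simply symlink this file
    return (shim_file_txt[:match.start(1)]
            + 'spark' + buildver
            + shim_file_txt[match.end(1):])

src = "package com.nvidia.spark.rapids.$_spark.version.classifier_\n\nclass RapidsShuffleManager {}\n"
print(interpolate_classifier(src, "341"))
# package com.nvidia.spark.rapids.spark341
# (rest of the file unchanged)
```
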
89 changes: 89 additions & 0 deletions docs/archive.md
@@ -5,6 +5,95 @@ nav_order: 15
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.

## Release v24.10.0
### Hardware Requirements:

The plugin is tested on the following architectures:

GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs

### Software Requirements:

OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8

NVIDIA Driver*: R470+

Runtime:
Scala 2.12, 2.13
Python, and a Java Virtual Machine (JVM) compatible with your Spark version.

* Check the Spark documentation for Python and Java version compatibility with your specific
Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1.

Supported Spark versions:
Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4
Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4
Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3
Apache Spark 3.5.0, 3.5.1, 3.5.2

Supported Databricks runtime versions for Azure and AWS:
Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0)
Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2)
Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1)

Supported Dataproc versions (Debian/Ubuntu/Rocky):
GCP Dataproc 2.1
GCP Dataproc 2.2

Supported Dataproc Serverless versions:
Spark runtime 1.1 LTS
Spark runtime 2.0
Spark runtime 2.1
Spark runtime 2.2

*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet
for your hardware's minimum driver version.

*For Cloudera and EMR support, please refer to the
[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ.

### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html)

### Download RAPIDS Accelerator for Apache Spark v24.10.0

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.

### Verify signature
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.0.jar.asc rapids-4-spark_2.12-24.10.0.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.0.jar.asc rapids-4-spark_2.13-24.10.0.jar`

The output of signature verify:

gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
* Optimize scheduling policy for GPU Semaphore
* Support distinct join for right outer joins
* Support MinBy and MaxBy for non-float ordering
* Support ArrayJoin expression
* Optimize Expand and Aggregate expression performance
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.0 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).

## Release v24.08.1
### Hardware Requirements:

9 changes: 4 additions & 5 deletions docs/compatibility.md
@@ -484,17 +484,16 @@ These are the known edge cases where running on the GPU will produce different r
next to a newline or a repetition that produces zero or more results
([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))`
- Word and non-word boundaries, `\b` and `\B`
- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by
another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but
not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)).

The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU.

- Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`).
- String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
- String anchor `\z` is not supported
- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero
- String anchor `\z` is not supported.
- Patterns containing an end-of-line or string anchor immediately next to a newline or repetition that produces zero
or more results
- Patterns containing end-of-line anchors like `$` or `\Z` immediately followed by
escape sequences (e.g., `\w`, `\b`) are not supported.
- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D`
- Line and string anchors are not supported by `string_split` and `str_to_map`
- Lazy quantifiers within a choice block such as `(2|\u2029??)+`
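As a quick way to confirm that a pattern from the lists above really keeps an expression off the GPU, the plugin can report fallback reasons. Below is a hedged sketch, assuming a SparkSession `spark` that already has the RAPIDS Accelerator enabled; setting `spark.rapids.sql.explain` to `NOT_ON_GPU` asks the plugin to log which expressions stay on the CPU and why.

```python
import pyspark.sql.functions as f

# Assumes an existing SparkSession `spark` with the RAPIDS Accelerator enabled.
spark.conf.set("spark.rapids.sql.explain", "NOT_ON_GPU")

df = spark.createDataFrame([("TEST line",)], ["s"])

# \Z is listed above as unsupported for regexp_replace, so this expression is
# expected to fall back to the CPU; the plugin logs the reason for the fallback.
df.select(f.regexp_replace(f.col("s"), "TEST\\Z", "X")).collect()
```
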
10 changes: 9 additions & 1 deletion docs/dev/shimplify.md
@@ -65,7 +65,15 @@ validations:
* The file is stored under the *owner shim* directory.

* All participating files listing the `buildver` of the current Maven build session are symlinked to
`target/${buildver}/generated/src/(main|test)/(scala|java)`. Thus, instead of hardcoding distinct
`target/${buildver}/generated/src/(main|test)/(scala|java)`
except for template classes requiring spark.version.classifier in the package name.

* If the package name of a class such as RapidsShuffleManager contains `$_spark.version.classifier_`
(because it is source-identical across shims up to the package name), it will be materialized in the
`target/${buildver}/generated/src/(main|test)/(scala|java)` directory with the `spark.version.classifier`
interpolated into the package name.

Thus, instead of hardcoding distinct
lists of directories for `build-helper` Maven plugin to add (one for each shim) after the full
transition to shimplify, the pom will have only 4 add-source statements, independent of the
number of supported shims.
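The symlinking described above is driven by the shim-json-lines comment each shim source carries, listing the builds the file participates in (one JSON object per line with a `spark` key, which is what `__traverse_source_tree_of_all_shims` parses). A small illustrative sketch follows; the exact opening and closing tag strings are an assumption here, since shimplify.py keeps the authoritative definitions.

```python
import json
import re

# Assumed tag strings; shimplify.py defines the authoritative __opening_shim_tag
# and __closing_shim_tag values.
OPENING = '/*** spark-rapids-shim-json-lines'
CLOSING = 'spark-rapids-shim-json-lines ***/'
SHIM_COMMENT = re.compile(re.escape(OPENING) + r'\n(.*)\n' + re.escape(CLOSING), re.DOTALL)

example_source = '''/*** spark-rapids-shim-json-lines
{"spark": "330"}
{"spark": "341"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids
'''

body = SHIM_COMMENT.search(example_source).group(1)
build_vers = [str(json.loads(line).get('spark')) for line in body.splitlines()]
assert build_vers == sorted(build_vers), "shim list is not properly sorted"

# A build for spark341 would symlink (or materialize) this file; spark350 would skip it.
print('341' in build_vers, '350' in build_vers)  # True False
```
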
21 changes: 11 additions & 10 deletions docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details.

## Release v24.10.0
## Release v24.10.1
### Hardware Requirements:

The plugin is tested on the following architectures:
@@ -27,7 +27,8 @@ The plugin is tested on the following architectures:

### Software Requirements:

OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8
OS: Spark RAPIDS is compatible with any Linux distribution with glibc >= 2.28 (Please check ldd --version output). glibc 2.28 was released August 1, 2018.
Tested on Ubuntu 20.04, Ubuntu 22.04, Rocky Linux 8 and Rocky Linux 9

NVIDIA Driver*: R470+

@@ -68,14 +69,14 @@ for your hardware's minimum driver version.
### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html)

### Download RAPIDS Accelerator for Apache Spark v24.10.0
### Download RAPIDS Accelerator for Apache Spark v24.10.1

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar.asc) |
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.
@@ -84,9 +85,9 @@ CUDA 11.8 through CUDA 12.0.
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.0.jar.asc rapids-4-spark_2.12-24.10.0.jar`
`gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.0.jar.asc rapids-4-spark_2.13-24.10.0.jar`
`gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`

The output of signature verify:

@@ -101,7 +102,7 @@ The output of signature verify:
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.0 release when decompressing gzip files on H100 GPUs.
Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
6 changes: 5 additions & 1 deletion integration_tests/src/main/python/aqe_test.py
@@ -19,7 +19,7 @@
from conftest import is_databricks_runtime, is_not_utc
from data_gen import *
from marks import ignore_order, allow_non_gpu
from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330
from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330, is_databricks_version_or_later

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
not_utc_aqe_allow=['ShuffleExchangeExec', 'HashAggregateExec'] if is_not_utc() else []
@@ -340,6 +340,8 @@ def do_it(spark):
aqe_join_with_dpp_fallback=["FilterExec"] if (is_databricks_runtime() or is_before_spark_330()) else []

# Verify that DPP and AQE can coexist in even some odd cases involving multiple tables
@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3),
reason="https://github.com/NVIDIA/spark-rapids/issues/11643")
@ignore_order(local=True)
@allow_non_gpu(*aqe_join_with_dpp_fallback)
def test_aqe_join_with_dpp(spark_tmp_path):
@@ -393,6 +395,8 @@ def run_test(spark):
assert_gpu_and_cpu_are_equal_collect(run_test, conf=_adaptive_conf)

# Verify that DPP and AQE can coexist in even some odd cases involving 2 tables with multiple columns
@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3),
reason="https://github.com/NVIDIA/spark-rapids/issues/11643")
@ignore_order(local=True)
@allow_non_gpu(*aqe_join_with_dpp_fallback)
def test_aqe_join_with_dpp_multi_columns(spark_tmp_path):
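The new `@pytest.mark.skipif` gates above rely on a Databricks version check imported from `spark_session`. A hedged sketch of how such a gate can be implemented follows; the real helper in `integration_tests/src/main/python/spark_session.py` may differ, and the exact format of `DATABRICKS_RUNTIME_VERSION` is an assumption.

```python
import os

# Hedged sketch of a version gate like is_databricks_version_or_later(14, 3).
# Assumes DATABRICKS_RUNTIME_VERSION looks like "14.3" on Databricks clusters.
def is_databricks_version_or_later(major, minor):
    version = os.environ.get('DATABRICKS_RUNTIME_VERSION')
    if not version:
        return False  # not running on Databricks at all
    db_major, db_minor = (int(part) for part in version.split('.')[:2])
    return (db_major, db_minor) >= (major, minor)
```
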
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/misc_expr_test.py
@@ -19,7 +19,7 @@
from marks import incompat, approximate_float
from pyspark.sql.types import *
import pyspark.sql.functions as f
from spark_session import is_before_spark_400
from spark_session import is_databricks_version_or_later, is_spark_400_or_later

def test_mono_id():
assert_gpu_and_cpu_are_equal_collect(
@@ -34,8 +34,8 @@ def test_part_id():
f.spark_partition_id()))


@pytest.mark.skipif(condition=not is_before_spark_400(),
reason="raise_error() not currently implemented for Spark 4.0. "
@pytest.mark.skipif(condition=is_spark_400_or_later() or is_databricks_version_or_later(14, 3),
reason="raise_error() not currently implemented for Spark 4.0, or Databricks 14.3. "
"See https://github.com/NVIDIA/spark-rapids/issues/10107.")
def test_raise_error():
data_gen = ShortGen(nullable=False, min_val=0, max_val=20, special_cases=[])