
Commit

Merge remote-tracking branch 'origin/branch-24.12' into db-14.3-string-test

mythrocks committed Nov 4, 2024
2 parents 7807452 + 2e16ff2, commit e7732c5
Showing 54 changed files with 784 additions and 1,039 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,5 @@
# Change log
Generated on 2024-10-18
Generated on 2024-10-31

## Release 24.10

@@ -26,6 +26,7 @@ Generated on 2024-10-18
### Bugs Fixed
|||
|:---|:---|
|[#11558](https://github.com/NVIDIA/spark-rapids/issues/11558)|[BUG] test_sortmerge_join_ridealong fails on DB 13.3|
|[#11573](https://github.com/NVIDIA/spark-rapids/issues/11573)|[BUG] very long tail task is observed when many tasks are contending for PrioritySemaphore|
|[#11367](https://github.com/NVIDIA/spark-rapids/issues/11367)|[BUG] Error "table_view.cpp:36: Column size mismatch" when using approx_percentile on a string column|
|[#11543](https://github.com/NVIDIA/spark-rapids/issues/11543)|[BUG] test_yyyyMMdd_format_for_legacy_mode[DATAGEN_SEED=1727619674, TZ=UTC] failed GPU and CPU are not both null|
@@ -68,6 +69,8 @@ Generated on 2024-10-18
### PRs
|||
|:---|:---|
|[#11676](https://github.com/NVIDIA/spark-rapids/pull/11676)| Fix race condition with Parquet filter pushdown modifying shared hadoop Configuration|
|[#11626](https://github.com/NVIDIA/spark-rapids/pull/11626)|Update latest changelog [skip ci]|
|[#11624](https://github.com/NVIDIA/spark-rapids/pull/11624)|Update the download link [skip ci]|
|[#11577](https://github.com/NVIDIA/spark-rapids/pull/11577)|Update latest changelog [skip ci]|
|[#11576](https://github.com/NVIDIA/spark-rapids/pull/11576)|Update rapids JNI and private dependency to 24.10.0|
44 changes: 33 additions & 11 deletions build/shimplify.py
@@ -84,6 +84,7 @@
import os
import re
import subprocess
from functools import partial


def __project():
@@ -199,7 +200,9 @@ def __csv_as_arr(str_val):
__shim_comment_pattern = re.compile(re.escape(__opening_shim_tag) +
r'\n(.*)\n' +
re.escape(__closing_shim_tag), re.DOTALL)

__spark_version_classifier = '$_spark.version.classifier_'
__spark_version_placeholder = re.escape(__spark_version_classifier)
__package_pattern = re.compile('package .*' + '(' + __spark_version_placeholder + ')')
def __upsert_shim_json(filename, bv_list):
with open(filename, 'r') as file:
contents = file.readlines()
@@ -365,10 +368,7 @@ def __generate_symlinks():
__log.info("# generating symlinks for shim %s %s files", buildver, src_type)
__traverse_source_tree_of_all_shims(
src_type,
lambda src_type, path, build_ver_arr: __generate_symlink_to_file(buildver,
src_type,
path,
build_ver_arr))
partial(__generate_symlink_to_file, buildver=buildver, src_type=src_type))

def __traverse_source_tree_of_all_shims(src_type, func):
"""Walks src/<src_type>/sparkXYZ"""
@@ -392,11 +392,10 @@ def __traverse_source_tree_of_all_shims(src_type, func):
build_ver_arr = map(lambda x: str(json.loads(x).get('spark')), shim_arr)
__log.debug("extracted shims %s", build_ver_arr)
assert build_ver_arr == sorted(build_ver_arr),\
"%s shim list is not properly sorted" % shim_file_path
func(src_type, shim_file_path, build_ver_arr)

"%s shim list is not properly sorted: %s" % (shim_file_path, build_ver_arr)
func(shim_file_path=shim_file_path, build_ver_arr=build_ver_arr, shim_file_txt=shim_file_txt)

def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr):
def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr, shim_file_txt):
if buildver in build_ver_arr:
project_base_dir = str(__project().getBaseDir())
base_dir = __src_basedir
@@ -416,9 +415,32 @@ def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr
target_shim_file_path = os.path.join(target_root, target_rel_path)
__log.debug("creating symlink %s -> %s", target_shim_file_path, shim_file_path)
__makedirs(os.path.dirname(target_shim_file_path))
if __should_overwrite:
package_match = __package_pattern.search(shim_file_txt)
if __should_overwrite or package_match:
__remove_file(target_shim_file_path)
__symlink(shim_file_path, target_shim_file_path)
if package_match:
with open(target_shim_file_path, mode='w') as f:
f.write(shim_file_txt[0:package_match.start(1)])
f.write("spark")
f.write(buildver)
f.write('\n')
f.write('''
/*
!!! DO NOT EDIT THIS FILE !!!
This file has been generated from the original
%s
by interpolating $_spark.version.classifier_=%s
Be sure to edit the original file if required
*/
''' % (shim_file_path, 'spark' + buildver))
f.write(shim_file_txt[package_match.end(1):])
else:
__symlink(shim_file_path, target_shim_file_path)


def __symlink(src, target):
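To make the new logic above concrete: when a shim source file's package declaration contains the `$_spark.version.classifier_` placeholder, shimplify now materializes a copy with `spark<buildver>` interpolated into the package name instead of symlinking the file. Below is a minimal standalone sketch of just that substitution; the helper name `interpolate_classifier` is illustrative, and the generated-file banner written by the real code is omitted.

```python
import re

# Same placeholder and package pattern as the diff above.
SPARK_VERSION_CLASSIFIER = '$_spark.version.classifier_'
PACKAGE_PATTERN = re.compile('package .*' + '(' + re.escape(SPARK_VERSION_CLASSIFIER) + ')')

def interpolate_classifier(shim_file_txt, buildver):
    """Return the file text with the placeholder in the package declaration
    replaced by spark<buildver>, or None when the file has no template package."""
    match = PACKAGE_PATTERN.search(shim_file_txt)
    if match is None:
        return None  # no placeholder: shimplify would simply symlink this file
    return (shim_file_txt[:match.start(1)]
            + 'spark' + buildver
            + shim_file_txt[match.end(1):])

src = "package com.nvidia.spark.rapids.$_spark.version.classifier_\n\nclass RapidsShuffleManager {}\n"
print(interpolate_classifier(src, "341"))
# package com.nvidia.spark.rapids.spark341
# (rest of the file unchanged)
```
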
89 changes: 89 additions & 0 deletions docs/archive.md
@@ -5,6 +5,95 @@ nav_order: 15
---
Below are archived releases for RAPIDS Accelerator for Apache Spark.

## Release v24.10.0
### Hardware Requirements:

The plugin is tested on the following architectures:

GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs

### Software Requirements:

OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8

NVIDIA Driver*: R470+

Runtime:
Scala 2.12, 2.13
Python, and a Java Virtual Machine (JVM) compatible with your Spark version.

* Check the Spark documentation for Python and Java version compatibility with your specific
Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1.

Supported Spark versions:
Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4
Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4
Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3
Apache Spark 3.5.0, 3.5.1, 3.5.2

Supported Databricks runtime versions for Azure and AWS:
Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0)
Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2)
Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1)

Supported Dataproc versions (Debian/Ubuntu/Rocky):
GCP Dataproc 2.1
GCP Dataproc 2.2

Supported Dataproc Serverless versions:
Spark runtime 1.1 LTS
Spark runtime 2.0
Spark runtime 2.1
Spark runtime 2.2

*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet
for your hardware's minimum driver version.

*For Cloudera and EMR support, please refer to the
[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ.

### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html)

### Download RAPIDS Accelerator for Apache Spark v24.10.0

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.

### Verify signature
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.0.jar.asc rapids-4-spark_2.12-24.10.0.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.0.jar.asc rapids-4-spark_2.13-24.10.0.jar`

The output of signature verify:

gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"

### Release Notes
* Optimize scheduling policy for GPU Semaphore
* Support distinct join for right outer joins
* Support MinBy and MaxBy for non-float ordering
* Support ArrayJoin expression
* Optimize Expand and Aggregate expression performance
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.0 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).

## Release v24.08.1
### Hardware Requirements:

9 changes: 4 additions & 5 deletions docs/compatibility.md
@@ -484,17 +484,16 @@ These are the known edge cases where running on the GPU will produce different r
next to a newline or a repetition that produces zero or more results
([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))`
- Word and non-word boundaries, `\b` and `\B`
- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by
another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but
not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)).

The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU.

- Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`).
- String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
- String anchor `\z` is not supported
- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero
- String anchor `\z` is not supported.
- Patterns containing an end-of-line or string anchor immediately next to a newline or repetition that produces zero
or more results
- Patterns containing end-of-line anchors like `$` or `\Z` immediately followed by
escape sequences (e.g., `\w`, `\b`) are not supported.
- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D`
- Line and string anchors are not supported by `string_split` and `str_to_map`
- Lazy quantifiers within a choice block such as `(2|\u2029??)+`
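As a quick way to confirm that a pattern from the lists above really keeps an expression off the GPU, the plugin can report fallback reasons. Below is a hedged sketch, assuming a SparkSession `spark` that already has the RAPIDS Accelerator enabled; setting `spark.rapids.sql.explain` to `NOT_ON_GPU` asks the plugin to log which expressions stay on the CPU and why.

```python
import pyspark.sql.functions as f

# Assumes an existing SparkSession `spark` with the RAPIDS Accelerator enabled.
spark.conf.set("spark.rapids.sql.explain", "NOT_ON_GPU")

df = spark.createDataFrame([("TEST line",)], ["s"])

# \Z is listed above as unsupported for regexp_replace, so this expression is
# expected to fall back to the CPU; the plugin logs the reason for the fallback.
df.select(f.regexp_replace(f.col("s"), "TEST\\Z", "X")).collect()
```
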
10 changes: 9 additions & 1 deletion docs/dev/shimplify.md
@@ -65,7 +65,15 @@ validations:
* The file is stored under the *owner shim* directory.

* All participating files listing the `buildver` of the current Maven build session are symlinked to
`target/${buildver}/generated/src/(main|test)/(scala|java)`. Thus, instead of hardcoding distinct
`target/${buildver}/generated/src/(main|test)/(scala|java)`
except for template classes requiring spark.version.classifier in the package name.

* If the package name of a class such as RapidsShuffleManager contains `$_spark.version.classifier_`
(because it is source-identical across shims up to the package name), it will be materialized in the
`target/${buildver}/generated/src/(main|test)/(scala|java)` directory with the `spark.version.classifier`
interpolated into the package name.

Thus, instead of hardcoding distinct
lists of directories for `build-helper` Maven plugin to add (one for each shim) after the full
transition to shimplify, the pom will have only 4 add-source statements, independent of the
number of supported shims.
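The symlinking described above is driven by the shim-json-lines comment each shim source carries, listing the builds the file participates in (one JSON object per line with a `spark` key, which is what `__traverse_source_tree_of_all_shims` parses). A small illustrative sketch follows; the exact opening and closing tag strings are an assumption here, since shimplify.py keeps the authoritative definitions.

```python
import json
import re

# Assumed tag strings; shimplify.py defines the authoritative __opening_shim_tag
# and __closing_shim_tag values.
OPENING = '/*** spark-rapids-shim-json-lines'
CLOSING = 'spark-rapids-shim-json-lines ***/'
SHIM_COMMENT = re.compile(re.escape(OPENING) + r'\n(.*)\n' + re.escape(CLOSING), re.DOTALL)

example_source = '''/*** spark-rapids-shim-json-lines
{"spark": "330"}
{"spark": "341"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids
'''

body = SHIM_COMMENT.search(example_source).group(1)
build_vers = [str(json.loads(line).get('spark')) for line in body.splitlines()]
assert build_vers == sorted(build_vers), "shim list is not properly sorted"

# A build for spark341 would symlink (or materialize) this file; spark350 would skip it.
print('341' in build_vers, '350' in build_vers)  # True False
```
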
21 changes: 11 additions & 10 deletions docs/download.md
@@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details.

## Release v24.10.0
## Release v24.10.1
### Hardware Requirements:

The plugin is tested on the following architectures:
@@ -27,7 +27,8 @@ The plugin is tested on the following architectures:

### Software Requirements:

OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8
OS: Spark RAPIDS is compatible with any Linux distribution with glibc >= 2.28 (Please check ldd --version output). glibc 2.28 was released August 1, 2018.
Tested on Ubuntu 20.04, Ubuntu 22.04, Rocky Linux 8 and Rocky Linux 9

NVIDIA Driver*: R470+

@@ -68,14 +69,14 @@ for your hardware's minimum driver version.
### RAPIDS Accelerator's Support Policy for Apache Spark
The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html)

### Download RAPIDS Accelerator for Apache Spark v24.10.0
### Download RAPIDS Accelerator for Apache Spark v24.10.1

| Processor | Scala Version | Download Jar | Download Signature |
|-----------|---------------|--------------|--------------------|
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.0/rapids-4-spark_2.12-24.10.0-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.0/rapids-4-spark_2.13-24.10.0-cuda11-arm64.jar.asc) |
| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1.jar.asc) |
| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1.jar.asc) |
| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.10.1/rapids-4-spark_2.12-24.10.1-cuda11-arm64.jar.asc) |
| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.10.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.10.1/rapids-4-spark_2.13-24.10.1-cuda11-arm64.jar.asc) |

This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with
CUDA 11.8 through CUDA 12.0.
@@ -84,9 +85,9 @@ CUDA 11.8 through CUDA 12.0.
* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
* Import the public key: `gpg --import PUB_KEY`
* Verify the signature for Scala 2.12 jar:
`gpg --verify rapids-4-spark_2.12-24.10.0.jar.asc rapids-4-spark_2.12-24.10.0.jar`
`gpg --verify rapids-4-spark_2.12-24.10.1.jar.asc rapids-4-spark_2.12-24.10.1.jar`
* Verify the signature for Scala 2.13 jar:
`gpg --verify rapids-4-spark_2.13-24.10.0.jar.asc rapids-4-spark_2.13-24.10.0.jar`
`gpg --verify rapids-4-spark_2.13-24.10.1.jar.asc rapids-4-spark_2.13-24.10.1.jar`

The output of signature verify:

@@ -101,7 +102,7 @@ The output of signature verify:
* Improve JSON related expressions
* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases)

Note: There is a known issue in the 24.10.0 release when decompressing gzip files on H100 GPUs.
Note: There is a known issue in the 24.10.1 release when decompressing gzip files on H100 GPUs.
Please find more details in [issue-16661](https://github.com/rapidsai/cudf/issues/16661).

For a detailed list of changes, please refer to the
6 changes: 5 additions & 1 deletion integration_tests/src/main/python/aqe_test.py
@@ -19,7 +19,7 @@
from conftest import is_databricks_runtime, is_not_utc
from data_gen import *
from marks import ignore_order, allow_non_gpu
from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330
from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330, is_databricks_version_or_later

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
not_utc_aqe_allow=['ShuffleExchangeExec', 'HashAggregateExec'] if is_not_utc() else []
@@ -340,6 +340,8 @@ def do_it(spark):
aqe_join_with_dpp_fallback=["FilterExec"] if (is_databricks_runtime() or is_before_spark_330()) else []

# Verify that DPP and AQE can coexist in even some odd cases involving multiple tables
@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3),
reason="https://github.com/NVIDIA/spark-rapids/issues/11643")
@ignore_order(local=True)
@allow_non_gpu(*aqe_join_with_dpp_fallback)
def test_aqe_join_with_dpp(spark_tmp_path):
@@ -393,6 +395,8 @@ def run_test(spark):
assert_gpu_and_cpu_are_equal_collect(run_test, conf=_adaptive_conf)

# Verify that DPP and AQE can coexist in even some odd cases involving 2 tables with multiple columns
@pytest.mark.skipif(condition=is_databricks_version_or_later(14, 3),
reason="https://github.com/NVIDIA/spark-rapids/issues/11643")
@ignore_order(local=True)
@allow_non_gpu(*aqe_join_with_dpp_fallback)
def test_aqe_join_with_dpp_multi_columns(spark_tmp_path):
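The new `@pytest.mark.skipif` gates above rely on a Databricks version check imported from `spark_session`. A hedged sketch of how such a gate can be implemented follows; the real helper in `integration_tests/src/main/python/spark_session.py` may differ, and the exact format of `DATABRICKS_RUNTIME_VERSION` is an assumption.

```python
import os

# Hedged sketch of a version gate like is_databricks_version_or_later(14, 3).
# Assumes DATABRICKS_RUNTIME_VERSION looks like "14.3" on Databricks clusters.
def is_databricks_version_or_later(major, minor):
    version = os.environ.get('DATABRICKS_RUNTIME_VERSION')
    if not version:
        return False  # not running on Databricks at all
    db_major, db_minor = (int(part) for part in version.split('.')[:2])
    return (db_major, db_minor) >= (major, minor)
```
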
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/misc_expr_test.py
@@ -19,7 +19,7 @@
from marks import incompat, approximate_float
from pyspark.sql.types import *
import pyspark.sql.functions as f
from spark_session import is_before_spark_400
from spark_session import is_databricks_version_or_later, is_spark_400_or_later

def test_mono_id():
assert_gpu_and_cpu_are_equal_collect(
@@ -34,8 +34,8 @@ def test_part_id():
f.spark_partition_id()))


@pytest.mark.skipif(condition=not is_before_spark_400(),
reason="raise_error() not currently implemented for Spark 4.0. "
@pytest.mark.skipif(condition=is_spark_400_or_later() or is_databricks_version_or_later(14, 3),
reason="raise_error() not currently implemented for Spark 4.0, or Databricks 14.3. "
"See https://github.com/NVIDIA/spark-rapids/issues/10107.")
def test_raise_error():
data_gen = ShortGen(nullable=False, min_val=0, max_val=20, special_cases=[])