diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index fc0959c5a415..3117872e2168 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -62,6 +62,8 @@ jobs:
       image_docs_url_link: ${{ steps.infra-image-link.outputs.image_docs_url_link }}
       image_lint_url: ${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}
       image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
+      image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
+      image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v4
@@ -154,6 +156,14 @@ jobs:
         IMG_NAME="apache-spark-ci-image-lint:${{ inputs.branch }}-${{ github.run_id }}"
         IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
         echo "image_lint_url=$IMG_URL" >> $GITHUB_OUTPUT
+    - name: Generate infra image URL (SparkR)
+      id: infra-image-sparkr-outputs
+      run: |
+        # Convert to lowercase to meet Docker repo name requirement
+        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+        IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
+        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+        echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
     - name: Link the docker images
       id: infra-image-link
       run: |
@@ -162,9 +172,11 @@ jobs:
         if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
           echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
         else
           echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
+          echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
         fi
 
   # Build: build Spark and run the tests for specified modules.
@@ -405,6 +417,17 @@ jobs:
           ${{ needs.precondition.outputs.image_lint_url }}
         # Use the infra image cache to speed up
         cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-lint-cache:${{ inputs.branch }}
+    - name: Build and push (SparkR)
+      if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+      id: docker_build_sparkr
+      uses: docker/build-push-action@v6
+      with:
+        context: ./dev/spark-test-image/sparkr/
+        push: true
+        tags: |
+          ${{ needs.precondition.outputs.image_sparkr_url }}
+        # Use the infra image cache to speed up
+        cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
 
   pyspark:
@@ -564,7 +587,7 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 180
     container:
-      image: ${{ needs.precondition.outputs.image_url }}
+      image: ${{ needs.precondition.outputs.image_sparkr_url_link }}
     env:
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3
@@ -671,8 +694,12 @@ jobs:
       run: |
        python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
        python3.11 -m pip list
-    - name: Python CodeGen check
+    - name: Python CodeGen check for branch-3.5
+      if: inputs.branch == 'branch-3.5'
       run: ./dev/connect-check-protos.py
+    - name: Python CodeGen check
+      if: inputs.branch != 'branch-3.5'
+      run: ./dev/check-protos.py
 
   # Static analysis
   lint:
diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
index b82d0633b0ce..a6beacedeebd 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -29,6 +29,7 @@ on:
       - 'dev/infra/Dockerfile'
       - 'dev/spark-test-image/docs/Dockerfile'
       - 'dev/spark-test-image/lint/Dockerfile'
+      - 'dev/spark-test-image/sparkr/Dockerfile'
       - '.github/workflows/build_infra_images_cache.yml'
   # Create infra image when cutting down branches/tags
   create:
@@ -88,3 +89,16 @@ jobs:
     - name: Image digest (Linter)
       if: hashFiles('dev/spark-test-image/lint/Dockerfile') != ''
       run: echo ${{ steps.docker_build_lint.outputs.digest }}
+    - name: Build and push (SparkR)
+      if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+      id: docker_build_sparkr
+      uses: docker/build-push-action@v6
+      with:
+        context: ./dev/spark-test-image/sparkr/
+        push: true
+        tags: ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}-static
+        cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }}
+        cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ github.ref_name }},mode=max
+    - name: Image digest (SparkR)
+      if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
+      run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
diff --git a/.github/workflows/build_python_3.11_macos.yml b/.github/workflows/build_python_3.11_macos.yml
new file mode 100644
index 000000000000..4caae55b5fea
--- /dev/null
+++ b/.github/workflows/build_python_3.11_macos.yml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Build / Python-only (master, Python 3.11, MacOS)"
+
+on:
+  schedule:
+    - cron: '0 21 * * *'
+
+jobs:
+  run-build:
+    permissions:
+      packages: write
+    name: Run
+    uses: ./.github/workflows/python_macos_test.yml
+    if: github.repository == 'apache/spark'
diff --git a/.github/workflows/python_macos_test.yml b/.github/workflows/python_macos_test.yml
new file mode 100644
index 000000000000..cca133dab541
--- /dev/null
+++ b/.github/workflows/python_macos_test.yml
@@ -0,0 +1,162 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Build and test PySpark on macOS
+
+on:
+  workflow_call:
+    inputs:
+      java:
+        required: false
+        type: string
+        default: 17
+      python:
+        required: false
+        type: string
+        default: 3.11
+      branch:
+        description: Branch to run the build against
+        required: false
+        type: string
+        default: master
+      hadoop:
+        description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
+        required: false
+        type: string
+        default: hadoop3
+      envs:
+        description: Additional environment variables to set when running the tests. Should be in JSON format.
+        required: false
+        type: string
+        default: '{}'
+jobs:
+  build:
+    name: "PySpark test on macos: ${{ matrix.modules }}"
+    runs-on: macos-15
+    strategy:
+      fail-fast: false
+      matrix:
+        java:
+          - ${{ inputs.java }}
+        python:
+          - ${{inputs.python}}
+        modules:
+          - >-
+            pyspark-sql, pyspark-resource, pyspark-testing
+          - >-
+            pyspark-core, pyspark-errors, pyspark-streaming
+          - >-
+            pyspark-mllib, pyspark-ml, pyspark-ml-connect
+          - >-
+            pyspark-connect
+          - >-
+            pyspark-pandas
+          - >-
+            pyspark-pandas-slow
+          - >-
+            pyspark-pandas-connect-part0
+          - >-
+            pyspark-pandas-connect-part1
+          - >-
+            pyspark-pandas-connect-part2
+          - >-
+            pyspark-pandas-connect-part3
+    env:
+      MODULES_TO_TEST: ${{ matrix.modules }}
+      PYTHON_TO_TEST: python${{inputs.python}}
+      HADOOP_PROFILE: ${{ inputs.hadoop }}
+      HIVE_PROFILE: hive2.3
+      # GitHub Actions' default miniconda to use in pip packaging test.
+      CONDA_PREFIX: /usr/share/miniconda
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+      SPARK_LOCAL_IP: localhost
+      SKIP_UNIDOC: true
+      SKIP_MIMA: true
+      SKIP_PACKAGING: true
+      METASPACE_SIZE: 1g
+      BRANCH: ${{ inputs.branch }}
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v4
+      # In order to fetch changed files
+      with:
+        fetch-depth: 0
+        repository: apache/spark
+        ref: ${{ inputs.branch }}
+    - name: Sync the current branch with the latest in Apache Spark
+      if: github.repository != 'apache/spark'
+      run: |
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+    # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
+    - name: Cache SBT and Maven
+      uses: actions/cache@v4
+      with:
+        path: |
+          build/apache-maven-*
+          build/*.jar
+          ~/.sbt
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Coursier local repository
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/coursier
+        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          pyspark-coursier-
+    - name: Install Java ${{ matrix.java }}
+      uses: actions/setup-java@v4
+      with:
+        distribution: zulu
+        java-version: ${{ matrix.java }}
+    - name: Install Python packages (Python ${{matrix.python}})
+      run: |
+        python${{matrix.python}} -m pip install --ignore-installed 'blinker>=1.6.2'
+        python${{matrix.python}} -m pip install --ignore-installed 'six==1.16.0'
+        python${{matrix.python}} -m pip install numpy 'pyarrow>=15.0.0' 'six==1.16.0' 'pandas==2.2.3' scipy 'plotly>=4.8' 'mlflow>=2.8.1' coverage matplotlib openpyxl 'memory-profiler>=0.61.0' 'scikit-learn>=1.3.2' unittest-xml-reporting && \
+        python${{matrix.python}} -m pip install 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' 'googleapis-common-protos==1.65.0' 'graphviz==0.20.3' && \
+        python${{matrix.python}} -m pip cache purge && \
+        python${{matrix.python}} -m pip list
+    # Run the tests.
+    - name: Run tests
+      env: ${{ fromJSON(inputs.envs) }}
+      run: |
+        if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
+          export SKIP_PACKAGING=false
+          echo "Python Packaging Tests Enabled!"
+        fi
+        ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
+    - name: Upload test results to report
+      env: ${{ fromJSON(inputs.envs) }}
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+        path: "**/target/test-reports/*.xml"
+    - name: Upload unit tests log files
+      env: ${{ fromJSON(inputs.envs) }}
+      if: ${{ !success() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
+        path: "**/target/unit-tests.log"
diff --git a/assembly/README b/assembly/README
index ad1305c5b4d5..10c8254ae153 100644
--- a/assembly/README
+++ b/assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command
 If you need to build an assembly for a different version of Hadoop the
 hadoop-version system property needs to be set as in this example:
-  -Dhadoop.version=3.4.0
+  -Dhadoop.version=3.4.1
diff --git a/build/mvn b/build/mvn
index 060209ac1ac4..fef589fc0347 100755
--- a/build/mvn
+++ b/build/mvn
@@ -56,7 +56,7 @@ install_app() {
   local binary="${_DIR}/$6"
   local remote_tarball="${mirror_host}/${url_path}${url_query}"
   local local_checksum="${local_tarball}.${checksum_suffix}"
-  local remote_checksum="https://archive.apache.org/dist/${url_path}.${checksum_suffix}"
+  local remote_checksum="${mirror_host}/${url_path}.${checksum_suffix}${url_query}"
   local curl_opts="--retry 3 --silent --show-error -L"
   local wget_opts="--no-verbose"
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
index ad5e5ae845f8..4064f830e92d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -1023,12 +1023,14 @@ protected Collation buildCollation() {
 
     @Override
     protected CollationMeta buildCollationMeta() {
+      String language = ICULocaleMap.get(locale).getDisplayLanguage();
+      String country = ICULocaleMap.get(locale).getDisplayCountry();
       return new CollationMeta(
         CATALOG,
         SCHEMA,
         normalizedCollationName(),
-        ICULocaleMap.get(locale).getDisplayLanguage(),
-        ICULocaleMap.get(locale).getDisplayCountry(),
+        language.isEmpty() ? null : language,
+        country.isEmpty() ? null : country,
         VersionInfo.ICU_VERSION.toString(),
         COLLATION_PAD_ATTRIBUTE,
         accentSensitivity == AccentSensitivity.AS,
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index e90e6a5c083b..a85ef5d4774f 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -497,6 +497,12 @@
     ],
     "sqlState" : "56000"
   },
+  "CIRCULAR_CLASS_REFERENCE" : {
+    "message" : [
+      "Cannot have circular references in class, but got the circular reference of class <t>."
+    ],
+    "sqlState" : "42602"
+  },
   "CLASS_NOT_OVERRIDE_EXPECTED_METHOD" : {
     "message" : [
       "<className> must override either <method1> or <method2>."
     ],
@@ -959,7 +965,7 @@
     },
     "RANGE_FRAME_INVALID_TYPE" : {
       "message" : [
-        "The data type <orderSpecType> used in the order specification does not match the data type <valueBoundaryType> which is used in the range frame."
+        "The data type <orderSpecType> used in the order specification does not support the data type <valueBoundaryType> which is used in the range frame."
       ]
     },
     "RANGE_FRAME_MULTI_ORDER" : {
       "message" : [
@@ -1101,6 +1107,12 @@
     ],
     "sqlState" : "42K03"
   },
+  "DATETIME_FIELD_OUT_OF_BOUNDS" : {
+    "message" : [
+      "<rangeMessage>. If necessary set <ansiConfig> to \"false\" to bypass this error."
+    ],
+    "sqlState" : "22023"
+  },
   "DATETIME_OVERFLOW" : {
     "message" : [
       "Datetime operation overflow: <operation>."
     ],
@@ -2012,8 +2024,20 @@
   },
   "INTERVAL_ARITHMETIC_OVERFLOW" : {
     "message" : [
-      "<message>.<alternative>"
+      "Integer overflow while operating with intervals."
     ],
+    "subClass" : {
+      "WITHOUT_SUGGESTION" : {
+        "message" : [
+          "Try devising appropriate values for the interval parameters."
+        ]
+      },
+      "WITH_SUGGESTION" : {
+        "message" : [
+          "Use <functionName> to tolerate overflow and return NULL instead."
+        ]
+      }
+    },
     "sqlState" : "22015"
   },
   "INTERVAL_DIVIDED_BY_ZERO" : {
@@ -2597,6 +2621,12 @@
     },
     "sqlState" : "22006"
   },
+  "INVALID_INTERVAL_WITH_MICROSECONDS_ADDITION" : {
+    "message" : [
+      "Cannot add an interval to a date because its microseconds part is not 0. If necessary set <ansiConfig> to \"false\" to bypass this error."
+    ],
+    "sqlState" : "22006"
+  },
   "INVALID_INVERSE_DISTRIBUTION_FUNCTION" : {
     "message" : [
       "Invalid inverse distribution function <funcName>."
     ],
@@ -2645,6 +2675,12 @@
     ],
     "sqlState" : "2203G"
   },
+  "INVALID_JSON_RECORD_TYPE" : {
+    "message" : [
+      "Detected an invalid type of a JSON record while inferring a common schema in the mode <failFastMode>. Expected a STRUCT type, but found <invalidType>."
+    ],
+    "sqlState" : "22023"
+  },
   "INVALID_JSON_ROOT_FIELD" : {
     "message" : [
       "Cannot convert JSON root field to target Spark type."
     ],
@@ -2738,12 +2774,6 @@
     },
     "sqlState" : "42K0E"
   },
-  "INVALID_LOCATION" : {
-    "message" : [
-      "The location name cannot be an invalid URI, but `<location>` was given."
-    ],
-    "sqlState" : "42K05"
-  },
   "INVALID_NON_DETERMINISTIC_EXPRESSIONS" : {
     "message" : [
       "The operator expects a deterministic expression, but the actual expression is <sqlExprs>."
     ],
@@ -2950,6 +2980,12 @@
     },
     "sqlState" : "42601"
   },
+  "INVALID_PARTITION_VALUE" : {
+    "message" : [
+      "Failed to cast value <value> to data type <dataType> for partition column <columnName>. Ensure the value matches the expected data type for this partition column."
+    ],
+    "sqlState" : "42846"
+  },
   "INVALID_PROPERTY_KEY" : {
     "message" : [
       "<key> is an invalid property key, please use quotes, e.g. SET <key>=<value>."
     ],
@@ -3375,6 +3411,12 @@
     ],
     "sqlState" : "42K0L"
   },
+  "LABEL_ALREADY_EXISTS" : {
+    "message" : [
+      "The label