diff --git a/.gitignore b/.gitignore index a8104a162..0818ada9b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,7 @@ dependency-reduced-pom.xml core/src/execution/generated prebuild .flattened-pom.xml +rat.txt +filtered_rat.txt +dev/dist +apache-rat-*.jar diff --git a/core/Cargo.toml b/core/Cargo.toml index 4584dffce..49f9a48b4 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -16,8 +16,14 @@ # under the License. [package] -name = "comet" +name = "datafusion-comet" version = "0.1.0" +homepage = "https://datafusion.apache.org/comet" +repository = "https://github.com/apache/datafusion-comet" +authors = ["Apache DataFusion <dev@datafusion.apache.org>"] +description = "Apache DataFusion Comet: High performance accelerator for Apache Spark" +readme = "README.md" +license = "Apache-2.0" edition = "2021" include = [ "benches/*.rs", @@ -25,8 +31,6 @@ include = [ "Cargo.toml", ] -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] parquet-format = "4.0.0" # This must be kept in sync with that from parquet crate arrow = { git = "https://github.com/viirya/arrow-rs.git", rev = "3f1ae0c", features = ["prettyprint", "ffi", "chrono-tz"] } diff --git a/dev/release/README.md b/dev/release/README.md new file mode 100644 index 000000000..b20f2d48e --- /dev/null +++ b/dev/release/README.md @@ -0,0 +1,85 @@ + +# Comet Release Process + +This documentation is for creating an official source release of Apache DataFusion Comet. + +The release process is based on the parent Apache DataFusion project, so please refer to the +[DataFusion Release Process](https://github.com/apache/datafusion/blob/main/dev/release/README.md) for detailed +instructions if you are not familiar with the release process here. + +Here is a brief overview of the steps involved in creating a release: + +## Creating the Release Candidate + +This part of the process can be performed by any committer.
+ +- Create and merge a PR to update the version number and the changelog +- Push a release candidate tag (e.g. 0.1.0-rc1) to the Apache repository + +## Publishing the Release Candidate + +Most of this part of the process can only be performed by a PMC member. + +- Run the create-tarball script to create the source tarball and upload it to the dev subversion repository +- Start an email voting thread +- Once the vote passes, run the release-tarball script to move the tarball to the release subversion repository +- Register the release with the [Apache Reporter Service](https://reporter.apache.org/addrelease.html?datafusion) using + a version such as `COMET-0.1.0` +- Delete old release candidates and releases from the subversion repositories +- Push a release tag (e.g. 0.1.0) to the Apache repository +- Reply to the vote thread to close the vote and announce the release + +## Publishing JAR Files to Maven + +The process for publishing JAR files to Maven is not defined yet. + +## Publishing to crates.io + +We may choose to publish the `datafusion-comet` crate to crates.io so that other Rust projects can leverage the +Spark-compatible operators and expressions outside of Spark. + +## Verifying Release Candidates + +The vote email will link to this section of this document, so this is where we will need to provide instructions for +verifying a release candidate. + +The `dev/release/verify-release-candidate.sh` script in this repository can assist in the verification +process. It verifies the signature and checksums and runs the build. It does not run the test suite because this takes a long time +for this project and the test suites already run in CI before we create the release candidate, so running them +again is somewhat redundant.
+ +```shell +./dev/release/verify-release-candidate.sh 0.1.0 1 +``` + +We hope that users will verify the release beyond running this script by testing the release candidate with their +existing Spark jobs and reporting any functional issues or performance regressions. + +Another way of verifying the release is to follow the +[Comet Benchmarking Guide](https://datafusion.apache.org/comet/contributor-guide/benchmarking.html) and compare +performance with the previous release. + +## Post Release Activities + +Writing a blog post about the release is a great way to generate more interest in the project. We typically create a +Google document where the community can collaborate on a blog post. Once the content is agreed, a PR can be +created against the [datafusion-site](https://github.com/apache/datafusion-site) repository to add the blog post. Any +contributor can drive this process. diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py new file mode 100644 index 000000000..e30d72bdd --- /dev/null +++ b/dev/release/check-rat-report.py @@ -0,0 +1,59 @@ +#!/usr/bin/python +############################################################################## +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+##############################################################################
+import fnmatch
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+if len(sys.argv) != 3:
+    sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" %
+                     sys.argv[0])
+    sys.exit(1)
+
+exclude_globs_filename = sys.argv[1]
+xml_filename = sys.argv[2]
+
+# One exclusion glob per line; the with-block closes the file after reading.
+with open(exclude_globs_filename, "r") as globs_file:
+    globs = [line.strip() for line in globs_file]
+
+tree = ET.parse(xml_filename)
+root = tree.getroot()
+resources = root.findall('resource')
+
+all_ok = True
+for r in resources:
+    approvals = r.findall('license-approval')
+    # Nothing to report when there is no approval element or it is 'true'.
+    if not approvals or approvals[0].attrib['name'] == 'true':
+        continue
+    # Strip the first path component before matching against the globs.
+    clean_name = re.sub('^[^/]+/', '', r.attrib['name'])
+    # Report the resource unless an exclusion glob matches its cleaned name.
+    if not any(fnmatch.fnmatch(clean_name, g) for g in globs):
+        sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % (
+            clean_name, r.attrib['name'], approvals[0].attrib['name']))
+        all_ok = False
+
+if not all_ok:
+    sys.exit(1)
+
+print('OK')
+sys.exit(0)
diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh
new file mode 100755
index 000000000..367dcae8e
--- /dev/null
+++ b/dev/release/create-tarball.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/create-tarball.sh
+
+# This script creates a signed tarball in
+# dev/dist/apache-datafusion-comet-<version>-rc<rc>/ and uploads it to
+# the "dev" area of the dist.apache.datafusion repository and prepares an
+# email for sending to the dev@datafusion.apache.org list for a formal
+# vote.
+#
+# See release/README.md for full release instructions
+#
+# Requirements:
+#
+# 1. gpg setup for signing and have uploaded your public
+# signature to https://pgp.mit.edu/
+#
+# 2. Logged into the apache svn server with the appropriate
+# credentials
+#
+# 3. Install the requests python package
+#
+#
+# Based in part on 02-source.sh from apache/arrow
+#
+
+set -eo pipefail
+
+DEV_RELEASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEV_RELEASE_TOP_DIR="$(cd "${DEV_RELEASE_DIR}/../../" && pwd)"
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <version> <rc>"
+  echo "ex. $0 4.1.0 2"
+  exit 1
+fi
+
+if [[ -z "${GH_TOKEN}" ]]; then
+  echo "Please set personal github token through GH_TOKEN environment variable"
+  exit 1
+fi
+
+version=$1
+rc=$2
+tag="${version}-rc${rc}"
+
+release=apache-datafusion-comet-${version}
+distdir=${DEV_RELEASE_TOP_DIR}/dev/dist/${release}-rc${rc}
+tarname=${release}.tar.gz
+tarball=${distdir}/${tarname}
+url="https://dist.apache.org/repos/dist/dev/datafusion/${release}-rc${rc}"
+
+echo "Attempting to create ${tarball} from tag ${tag}"
+# "|| true" keeps "set -e" from aborting before the friendly message below
+release_hash=$(cd "${DEV_RELEASE_TOP_DIR}" && git rev-list --max-count=1 "${tag}" || true)
+
+if [ -z "$release_hash" ]; then
+  echo "Cannot continue: unknown git tag: ${tag}"
+  exit 1
+fi
+
+echo "Draft email for dev@datafusion.apache.org mailing list"
+echo ""
+echo "---------------------------------------------------------"
+# NOTE(review): heredoc body reconstructed; confirm the wording before sending
+cat <<MAIL
+To: dev@datafusion.apache.org
+Subject: [VOTE] Release Apache DataFusion Comet ${version} RC${rc}
+
+Hi,
+
+I would like to propose a release of Apache DataFusion Comet version ${version}.
+
+This release candidate is based on commit: ${release_hash} [1]
+
+The proposed release tarball and signatures are hosted at [2].
+
+Please download, verify checksums and signatures, and vote on the release.
+Instructions for verifying the release candidate are available at [3].
+
+The vote will be open for at least 72 hours.
+
+Only votes from PMC members are binding, but all members of the community
+are encouraged to test the release and vote with "(non-binding)".
+
+[ ] +1 Release this as Apache DataFusion Comet ${version}
+[ ] +0
+[ ] -1 Do not release this as Apache DataFusion Comet ${version} because...
+
+[1]: https://github.com/apache/datafusion-comet/tree/${release_hash}
+[2]: ${url}
+[3]: https://github.com/apache/datafusion-comet/blob/main/dev/release/README.md#verifying-release-candidates
+MAIL
+echo "---------------------------------------------------------"
+
+
+# create <tarball> containing the files in git at $release_hash
+# the files in the tarball are prefixed with {version} (e.g. 4.0.1)
+mkdir -p "${distdir}"
+(cd "${DEV_RELEASE_TOP_DIR}" && git archive "${release_hash}" --prefix "${release}/" | gzip > "${tarball}")
+
+echo "Running rat license checker on ${tarball}"
+"${DEV_RELEASE_DIR}/run-rat.sh" "${tarball}"
+
+echo "Signing tarball and creating checksums"
+gpg --armor --output "${tarball}.asc" --detach-sig "${tarball}"
+# create signing with relative path of tarball
+# so that they can be verified with a command such as
+#  shasum --check apache-datafusion-comet-0.1.0.tar.gz.sha512
+(cd "${distdir}" && shasum -a 256 "${tarname}") > "${tarball}.sha256"
+(cd "${distdir}" && shasum -a 512 "${tarname}") > "${tarball}.sha512"
+
+
+echo "Uploading to datafusion dist/dev to ${url}"
+svn co --depth=empty https://dist.apache.org/repos/dist/dev/datafusion "${DEV_RELEASE_TOP_DIR}/dev/dist"
+svn add "${distdir}"
+svn ci -m "Apache DataFusion Comet ${version} ${rc}" "${distdir}"
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
new file mode 100644
index 000000000..79d8db297
--- /dev/null
+++ b/dev/release/rat_exclude_files.txt
@@ -0,0 +1,16 @@
+*.gitignore
+*.dockerignore
+.github/pull_request_template.md
+.gitmodules
+core/Cargo.lock
+core/testdata/backtrace.txt
+core/testdata/stacktrace.txt
+docs/spark_builtin_expr_coverage.txt
+docs/source/contributor-guide/benchmark-results/**/*.json
+rust-toolchain
+spark/src/test/resources/tpcds-query-results/*.out
+spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/explain.txt
+spark/src/test/resources/tpcds-plan-stability/approved-plans*/**/simplified.txt
+spark/src/test/resources/tpch-query-results/*.out
+spark/src/test/resources/tpch-extended/q1.sql
+spark/inspections/CometTPC*results.txt
diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh
new file mode 100755
index 000000000..19eb4b449
--- /dev/null
+++ b/dev/release/release-tarball.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license
agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/release-tarball.sh
+
+# This script copies a tarball from the "dev" area of the
+# dist.apache.datafusion repository to the "release" area
+#
+# This script should only be run after the release has been approved
+# by the Apache DataFusion PMC committee.
+#
+# See release/README.md for full release instructions
+#
+# Based in part on post-01-upload.sh from apache/arrow
+
+set -e
+set -u
+
+# a usage error must exit non-zero so callers do not mistake it for success
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <version> <rc>"
+  echo "ex. $0 4.1.0 2"
+  exit 1
+fi
+
+version=$1
+rc=$2
+
+tmp_dir=tmp-apache-datafusion-comet-dist
+
+echo "Recreate temporary directory: ${tmp_dir}"
+rm -rf "${tmp_dir}"
+mkdir -p "${tmp_dir}"
+
+echo "Clone dev dist repository"
+svn co \
+  "https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-comet-${version}-rc${rc}" \
+  "${tmp_dir}/dev"
+
+echo "Clone release dist repository"
+svn co https://dist.apache.org/repos/dist/release/datafusion "${tmp_dir}/release"
+
+echo "Copy ${version}-rc${rc} to release working copy"
+release_version=datafusion-comet-${version}
+mkdir -p "${tmp_dir}/release/${release_version}"
+# the glob does not match dot-entries, so the dev checkout's .svn is not copied
+cp -r "${tmp_dir}"/dev/* "${tmp_dir}/release/${release_version}/"
+svn add "${tmp_dir}/release/${release_version}"
+
+echo "Commit release"
+svn ci -m "Apache DataFusion Comet ${version}" "${tmp_dir}/release"
+
+echo "Clean up"
+rm -rf "${tmp_dir}"
+
+echo "Success! The release is available here:"
+echo "  https://dist.apache.org/repos/dist/release/datafusion/${release_version}"
diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh
new file mode 100755
index 000000000..c55c45b7f
--- /dev/null
+++ b/dev/release/run-rat.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+RAT_VERSION=0.13
+RAT_JAR="apache-rat-${RAT_VERSION}.jar"
+
+# download apache rat; --fail avoids caching an HTTP error page as the jar
+if [ ! -f "${RAT_JAR}" ]; then
+  curl -sSfL -o "${RAT_JAR}" "https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/${RAT_JAR}" \
+    || { rm -f "${RAT_JAR}"; echo "Failed to download ${RAT_JAR}" >&2; exit 1; }
+fi
+
+RELEASE_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+# generate the rat report (-x selects XML output) for the path given as $1
+java -jar "${RAT_JAR}" -x "$1" > rat.txt
+python3 "${RELEASE_DIR}/check-rat-report.py" "${RELEASE_DIR}/rat_exclude_files.txt" rat.txt > filtered_rat.txt
+UNAPPROVED=$(grep -c "NOT APPROVED" filtered_rat.txt)
+
+if [ "0" -eq "${UNAPPROVED}" ]; then
+  echo "No unapproved licenses"
+else
+  echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt"
+  cat filtered_rat.txt
+  exit 1
+fi
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
new file mode 100755
index 000000000..da09e3d1b
--- /dev/null
+++ b/dev/release/verify-release-candidate.sh
@@ -0,0 +1,133 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+case $# in
+  2) VERSION="$1"
+     RC_NUMBER="$2"
+     ;;
+  *) echo "Usage: $0 X.Y.Z RC_NUMBER"
+     exit 1
+     ;;
+esac
+
+set -e
+set -x
+set -o pipefail
+
+SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
+COMET_DIR="$(dirname "$(dirname "${SOURCE_DIR}")")"
+COMET_DIST_URL='https://dist.apache.org/repos/dist/dev/datafusion'
+
+# Download $1 (a path relative to the dist URL) into the current directory.
+download_dist_file() {
+  curl --silent --show-error --fail --location \
+    --remote-name "${COMET_DIST_URL}/$1"
+}
+
+# Download $1 from this release candidate's directory under the dist URL.
+download_rc_file() {
+  download_dist_file "apache-datafusion-comet-${VERSION}-rc${RC_NUMBER}/$1"
+}
+
+import_gpg_keys() {
+  download_dist_file KEYS
+  gpg --import KEYS
+}
+
+if type shasum >/dev/null 2>&1; then
+  sha256_verify="shasum -a 256 -c"
+  sha512_verify="shasum -a 512 -c"
+else
+  sha256_verify="sha256sum -c"
+  sha512_verify="sha512sum -c"
+fi
+
+# Fetch the tarball named $1 with its signature and checksums, then verify them.
+fetch_archive() {
+  local dist_name=$1
+  download_rc_file "${dist_name}.tar.gz"
+  download_rc_file "${dist_name}.tar.gz.asc"
+  download_rc_file "${dist_name}.tar.gz.sha256"
+  download_rc_file "${dist_name}.tar.gz.sha512"
+  verify_dir_artifact_signatures
+}
+
+verify_dir_artifact_signatures() {
+  # verify the signature and the checksums of each artifact
+  find . -name '*.asc' | while read -r sigfile; do
+    # strip only the trailing ".asc", not the first ".asc" anywhere in the path
+    artifact=${sigfile%.asc}
+    gpg --verify "$sigfile" "$artifact" || exit 1
+
+    # go into the directory because the checksum files contain only the
+    # basename of the artifact
+    pushd "$(dirname "$artifact")"
+    base_artifact=$(basename "$artifact")
+    ${sha256_verify} "$base_artifact.sha256" || exit 1
+    ${sha512_verify} "$base_artifact.sha512" || exit 1
+    popd
+  done
+}
+
+# Use COMET_TMPDIR if set; otherwise make a temp dir that is removed on success.
+setup_tempdir() {
+  cleanup() {
+    if [ "${TEST_SUCCESS}" = "yes" ]; then
+      rm -fr "${COMET_TMPDIR}"
+    else
+      echo "Failed to verify release candidate. See ${COMET_TMPDIR} for details."
+    fi
+  }
+
+  if [ -z "${COMET_TMPDIR}" ]; then
+    # clean up automatically if COMET_TMPDIR is not defined
+    COMET_TMPDIR=$(mktemp -d -t "$1.XXXXX")
+    trap cleanup EXIT
+  else
+    # don't clean up automatically
+    mkdir -p "${COMET_TMPDIR}"
+  fi
+}
+
+test_source_distribution() {
+  pushd core
+    RUSTFLAGS="-Ctarget-cpu=native" cargo build --release
+  popd
+  # test with the latest supported version of Spark
+  ./mvnw verify -Prelease -DskipTests -P"spark-3.4" -Dmaven.gitcommitid.skip=true
+}
+
+TEST_SUCCESS=no
+
+setup_tempdir "datafusion-comet-${VERSION}"
+echo "Working in sandbox ${COMET_TMPDIR}"
+cd "${COMET_TMPDIR}"
+
+dist_name="apache-datafusion-comet-${VERSION}"
+import_gpg_keys
+fetch_archive "${dist_name}"
+tar xf "${dist_name}.tar.gz"
+pushd "${dist_name}"
+  test_source_distribution
+popd
+
+TEST_SUCCESS=yes
+echo 'Release candidate looks good!'
+exit 0
diff --git a/pom.xml b/pom.xml
index 8c322bae0..b6eedd4aa 100644
--- a/pom.xml
+++ b/pom.xml
@@ -922,6 +922,7 @@ under the License.
 tpcds-sf-1/**
 tpch/**
 docs/*.txt
+ dev/release/rat_exclude_files.txt