Skip to content
Closed
6 changes: 5 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,12 @@ jobs:
run: |
python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip list
- name: Python CodeGen check
- name: Python CodeGen check for branch-3.5
if: inputs.branch == 'branch-3.5'
run: ./dev/connect-check-protos.py
- name: Python CodeGen check
if: inputs.branch != 'branch-3.5'
run: ./dev/check-protos.py

# Static analysis
lint:
Expand Down
23 changes: 13 additions & 10 deletions dev/connect-check-protos.py → dev/check-protos.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#

# Utility for checking whether generated codes in PySpark are out of sync.
# usage: ./dev/connect-check-protos.py
# usage: ./dev/check-protos.py

import os
import sys
Expand All @@ -43,12 +43,12 @@ def run_cmd(cmd):
return subprocess.check_output(cmd.split(" ")).decode("utf-8")


def check_connect_protos():
print("Start checking the generated codes in pyspark-connect.")
with tempfile.TemporaryDirectory(prefix="check_connect_protos") as tmp:
run_cmd(f"{SPARK_HOME}/dev/connect-gen-protos.sh {tmp}")
def check_protos(module_name, cmp_path, proto_path):
print(f"Start checking the generated codes in pyspark-${module_name}.")
with tempfile.TemporaryDirectory(prefix=f"check_${module_name}__protos") as tmp:
run_cmd(f"{SPARK_HOME}/dev/gen-protos.sh {module_name} {tmp}")
result = filecmp.dircmp(
f"{SPARK_HOME}/python/pyspark/sql/connect/proto/",
f"{SPARK_HOME}/{cmp_path}",
tmp,
ignore=["__init__.py", "__pycache__"],
)
Expand All @@ -71,14 +71,17 @@ def check_connect_protos():
success = False

if success:
print("Finish checking the generated codes in pyspark-connect: SUCCESS")
print(f"Finish checking the generated codes in pyspark-${module_name}: SUCCESS")
else:
fail(
"Generated files for pyspark-connect are out of sync! "
"If you have touched files under sql/connect/common/src/main/protobuf/, "
"please run ./dev/connect-gen-protos.sh. "
f"If you have touched files under ${proto_path}, "
f"please run ./dev/${module_name}-gen-protos.sh. "
"If you haven't touched any file above, please rebase your PR against main branch."
)


check_connect_protos()
check_protos(
"connect", "python/pyspark/sql/connect/proto/", "sql/connect/common/src/main/protobuf/"
)
check_protos("streaming", "python/pyspark/sql/streaming/proto/", "sql/core/src/main/protobuf/")
78 changes: 1 addition & 77 deletions dev/connect-gen-protos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,80 +24,4 @@ if [[ $# -gt 1 ]]; then
exit -1
fi


SPARK_HOME="$(cd "`dirname $0`"/..; pwd)"
cd "$SPARK_HOME"


OUTPUT_PATH=${SPARK_HOME}/python/pyspark/sql/connect/proto/
if [[ $# -eq 1 ]]; then
rm -Rf $1
mkdir -p $1
OUTPUT_PATH=$1
fi

pushd sql/connect/common/src/main

LICENSE=$(cat <<'EOF'
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
EOF)
echo "$LICENSE" > /tmp/tmp_licence


# Delete the old generated protobuf files.
rm -Rf gen

# Now, regenerate the new files
buf generate --debug -vvv

# We need to edit the generate python files to account for the actual package location and not
# the one generated by proto.
for f in `find gen/proto/python -name "*.py*"`; do
# First fix the imports.
if [[ $f == *_pb2.py || $f == *_pb2_grpc.py ]]; then
sed -e 's/from spark.connect import/from pyspark.sql.connect.proto import/g' $f > $f.tmp
mv $f.tmp $f
# Now fix the module name in the serialized descriptor.
sed -e "s/DESCRIPTOR, 'spark.connect/DESCRIPTOR, 'pyspark.sql.connect.proto/g" $f > $f.tmp
mv $f.tmp $f
elif [[ $f == *.pyi ]]; then
sed -e 's/import spark.connect./import pyspark.sql.connect.proto./g' -e 's/spark.connect./pyspark.sql.connect.proto./g' -e '/ *@typing_extensions\.final/d' $f > $f.tmp
mv $f.tmp $f
fi

# Prepend the Apache licence header to the files.
cp $f $f.bak
cat /tmp/tmp_licence $f.bak > $f

LC=$(wc -l < $f)
echo $LC
if [[ $f == *_grpc.py && $LC -eq 20 ]]; then
rm $f
fi
rm $f.bak
done

black --config $SPARK_HOME/dev/pyproject.toml gen/proto/python

# Last step copy the result files to the destination module.
for f in `find gen/proto/python -name "*.py*"`; do
cp $f $OUTPUT_PATH
done

# Clean up everything.
rm -Rf gen
./dev/gen-protos.sh connect "$@"
127 changes: 127 additions & 0 deletions dev/gen-protos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -ex

SPARK_HOME="$(cd "`dirname $0`"/..; pwd)"
cd "$SPARK_HOME"

OUTPUT_PATH=""
MODULE=""
SOURCE_MODULE=""
TARGET_MODULE=""

function usage() {
echo "Illegal number of parameters."
echo "Usage:./dev/gen-protos.sh [connect|streaming] [output_path]"
exit -1
}

if [[ $# -lt 1 || $# -gt 2 ]]; then
usage
fi

if [[ $1 == "connect" ]]; then
MODULE="connect"
OUTPUT_PATH=${SPARK_HOME}/python/pyspark/sql/connect/proto/
SOURCE_MODULE="spark.connect"
TARGET_MODULE="pyspark.sql.connect.proto"
elif [[ $1 == "streaming" ]]; then
MODULE="streaming"
OUTPUT_PATH=${SPARK_HOME}/python/pyspark/sql/streaming/proto/
SOURCE_MODULE="org.apache.spark.sql.execution.streaming"
TARGET_MODULE="pyspark.sql.streaming.proto"
else
usage
fi

if [[ $# -eq 2 ]]; then
rm -Rf $2
mkdir -p $2
OUTPUT_PATH=$2
fi

if [[ $MODULE == "connect" ]]; then
pushd sql/connect/common/src/main
elif [[ $MODULE == "streaming" ]]; then
pushd sql/core/src/main
fi

LICENSE=$(cat <<'EOF'
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
EOF)
echo "$LICENSE" > /tmp/tmp_licence

# Delete the old generated protobuf files.
rm -Rf gen

# Now, regenerate the new files
buf generate --debug -vvv

# We need to edit the generate python files to account for the actual package location and not
# the one generated by proto.
for f in `find gen/proto/python -name "*.py*"`; do
# First fix the imports.
if [[ $f == *_pb2.py || $f == *_pb2_grpc.py ]]; then
sed -e "s/from ${SOURCE_MODULE} import/from ${TARGET_MODULE} import/g" $f > $f.tmp
mv $f.tmp $f
# Now fix the module name in the serialized descriptor.
sed -e "s/DESCRIPTOR, '${SOURCE_MODULE}/DESCRIPTOR, '${TARGET_MODULE}/g" $f > $f.tmp
mv $f.tmp $f
elif [[ $f == *.pyi ]]; then
sed -e "s/import ${SOURCE_MODULE}./import ${TARGET_MODULE}./g" -e "s/${SOURCE_MODULE}./${TARGET_MODULE}./g" -e '/ *@typing_extensions\.final/d' $f > $f.tmp
mv $f.tmp $f
fi

# Prepend the Apache licence header to the files.
cp $f $f.bak
cat /tmp/tmp_licence $f.bak > $f

LC=$(wc -l < $f)
echo $LC
if [[ $f == *_grpc.py && $LC -eq 20 ]]; then
rm $f
fi
rm $f.bak
done

black --config $SPARK_HOME/dev/pyproject.toml gen/proto/python

# Last step copy the result files to the destination module.
for f in `find gen/proto/python -name "*.py*"`; do
cp $f $OUTPUT_PATH
done

# Clean up everything.
rm -Rf gen
27 changes: 27 additions & 0 deletions dev/streaming-gen-protos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
set -ex

if [[ $# -gt 1 ]]; then
echo "Illegal number of parameters."
echo "Usage: ./dev/streaming-gen-protos.sh [path]"
exit -1
fi

./dev/gen-protos.sh streaming "$@"
1 change: 1 addition & 0 deletions dev/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,6 @@ exclude =
*python/pyspark/worker.pyi,
*python/pyspark/java_gateway.pyi,
*python/pyspark/sql/connect/proto/*,
*python/pyspark/sql/streaming/proto/*,
*/venv/*
max-line-length = 100
173 changes: 90 additions & 83 deletions python/pyspark/sql/streaming/proto/StateMessage_pb2.py

Large diffs are not rendered by default.

Loading