diff --git a/MANIFEST.in b/MANIFEST.in index 09e3734..e28dca0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -18,6 +18,7 @@ global-exclude *.py[cod] __pycache__ .DS_Store recursive-include deps/jars *.jar +recursive-include deps/hadoop *.jar include README.md include LICENSE include NOTICE diff --git a/dev/build-source-distribution-package.sh b/dev/build-source-distribution-package.sh index 26a8807..128700b 100755 --- a/dev/build-source-distribution-package.sh +++ b/dev/build-source-distribution-package.sh @@ -19,20 +19,34 @@ CURR_DIR=`pwd` BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" PROJECT_ROOT="${BASE_DIR}/../" -# prepare bridge jar - -DEPS_DIR=${PROJECT_ROOT}/deps/jars +DEPS_DIR=${PROJECT_ROOT}/deps rm -rf ${DEPS_DIR} -mkdir -p ${DEPS_DIR} -touch ${DEPS_DIR}/__init__.py + +# prepare bridge jar +BRIDGE_DEPS_DIR=${DEPS_DIR}/jars +mkdir -p ${BRIDGE_DEPS_DIR} +touch ${BRIDGE_DEPS_DIR}/__init__.py cd ${PROJECT_ROOT}/paimon-python-java-bridge # get bridge jar version -JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) +BRIDGE_JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) + +mvn clean install -DskipTests +cp "target/paimon-python-java-bridge-${BRIDGE_JAR_VERSION}.jar" ${BRIDGE_DEPS_DIR} + +# prepare hadoop-deps jar +HADOOP_DEPS_DIR=${DEPS_DIR}/hadoop +mkdir -p ${HADOOP_DEPS_DIR} +touch ${HADOOP_DEPS_DIR}/__init__.py + +cd ${PROJECT_ROOT}/hadoop-deps + +# get hadoop-deps jar version +HADOOP_JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) mvn clean install -DskipTests -cp "target/paimon-python-java-bridge-${JAR_VERSION}.jar" ${DEPS_DIR} +cp "target/hadoop-deps-${HADOOP_JAR_VERSION}.jar" ${HADOOP_DEPS_DIR} cd ${CURR_DIR} diff --git a/dev/lint-python.sh b/dev/lint-python.sh index e9d3e5e..33218f2 100755 --- a/dev/lint-python.sh +++ b/dev/lint-python.sh @@ -580,6 +580,8 @@ function tox_check() { # dummy jar needed by setup.py mkdir -p $PAIMON_PYTHON_DIR/deps/jars touch 
$PAIMON_PYTHON_DIR/deps/jars/dummy.jar + mkdir -p $PAIMON_PYTHON_DIR/deps/hadoop + touch $PAIMON_PYTHON_DIR/deps/hadoop/dummy.jar if [[ -n "$GITHUB_ACTION" ]]; then # Run tests in all versions triggered by a Git push (tests aren't so many currently) diff --git a/hadoop-deps/pom.xml b/hadoop-deps/pom.xml new file mode 100644 index 0000000..fb1decd --- /dev/null +++ b/hadoop-deps/pom.xml @@ -0,0 +1,73 @@ + + + + 4.0.0 + + org.apache.pypaimon + hadoop-deps + 3.3.4 + + + 3.3.4 + 2.17.1 + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + org.apache.hadoop + hadoop-hdfs-client + ${hadoop.version} + + + org.apache.logging.log4j + log4j-api + ${log4j.version} + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + package + + shade + + + false + + + + + + + + + + diff --git a/pypaimon/py4j/gateway_server.py b/pypaimon/py4j/gateway_server.py index f3a0fda..2588217 100644 --- a/pypaimon/py4j/gateway_server.py +++ b/pypaimon/py4j/gateway_server.py @@ -102,12 +102,20 @@ def _get_classpath(env): return os.pathsep.join(classpath) +_HADOOP_DEPS_PACKAGE = 'pypaimon.hadoop-deps' + + def _get_hadoop_classpath(env): if constants.PYPAIMON_HADOOP_CLASSPATH in env: return env[constants.PYPAIMON_HADOOP_CLASSPATH] - - if 'HADOOP_CLASSPATH' in env: + elif 'HADOOP_CLASSPATH' in env: return env['HADOOP_CLASSPATH'] else: - raise EnvironmentError(f"You haven't set '{constants.PYPAIMON_HADOOP_CLASSPATH}', \ - and 'HADOOP_CLASSPATH' is also not set. Ensure one of them is set.") + # use built-in hadoop + jars = importlib.resources.files(_HADOOP_DEPS_PACKAGE) + one_jar = next(iter(jars.iterdir()), None) + if not one_jar: + raise EnvironmentError(f"The built-in Hadoop environment has been broken, this \ + is unexpected. 
You can set one of '{constants.PYPAIMON_HADOOP_CLASSPATH}' or \ + 'HADOOP_CLASSPATH' to continue.") + return os.path.join(os.path.dirname(str(one_jar)), '*') diff --git a/setup.py b/setup.py index 4fc12a6..98515e0 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,8 @@ 'pypaimon.api', 'pypaimon.py4j', 'pypaimon.py4j.util', - 'pypaimon.jars' + 'pypaimon.jars', + 'pypaimon.hadoop-deps' ] install_requires = [ @@ -57,10 +58,12 @@ include_package_data=True, # releasing tool will generate deps package_dir={ - "pypaimon.jars": "deps/jars" + "pypaimon.jars": "deps/jars", + "pypaimon.hadoop-deps": "deps/hadoop" }, package_data={ - "pypaimon.jars": ["*.jar"] + "pypaimon.jars": ["*.jar"], + "pypaimon.hadoop-deps": ["*.jar"] }, install_requires=install_requires, description='Apache Paimon Python API', diff --git a/tools/releasing/create_source_release.sh b/tools/releasing/create_source_release.sh index ad12a42..5e79be1 100755 --- a/tools/releasing/create_source_release.sh +++ b/tools/releasing/create_source_release.sh @@ -55,11 +55,13 @@ fi ########################### -# prepare bridge jar - -DEPS_DIR=${PROJECT_ROOT}/deps/jars +DEPS_DIR=${PROJECT_ROOT}/deps rm -rf ${DEPS_DIR} -mkdir -p ${DEPS_DIR} + +# prepare bridge jar +BRIDGE_DEPS_DIR=${DEPS_DIR}/jars +mkdir -p ${BRIDGE_DEPS_DIR} +touch ${BRIDGE_DEPS_DIR}/__init__.py cd ${PROJECT_ROOT}/paimon-python-java-bridge @@ -70,10 +72,23 @@ if grep -q ".*SNAPSHOT" "pom.xml"; then fi # get bridge jar version -JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) +BRIDGE_JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' pom.xml | head -n 1) + +mvn clean install -DskipTests +cp "target/paimon-python-java-bridge-${BRIDGE_JAR_VERSION}.jar" ${BRIDGE_DEPS_DIR} + +# prepare hadoop-deps jar +HADOOP_DEPS_DIR=${DEPS_DIR}/hadoop +mkdir -p ${HADOOP_DEPS_DIR} +touch ${HADOOP_DEPS_DIR}/__init__.py + +cd ${PROJECT_ROOT}/hadoop-deps + +# get hadoop-deps jar version +HADOOP_JAR_VERSION=$(sed -n 's/.*<version>\(.*\)<\/version>.*/\1/p' 
pom.xml | head -n 1) mvn clean install -DskipTests -cp "target/paimon-python-java-bridge-${JAR_VERSION}.jar" ${DEPS_DIR} +cp "target/hadoop-deps-${HADOOP_JAR_VERSION}.jar" ${HADOOP_DEPS_DIR} cd ${CURR_DIR}