3 changes: 3 additions & 0 deletions .github/workflows/build_and_test.yml
@@ -564,6 +564,9 @@ jobs:
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
if: inputs.branch != 'branch-3.2' && inputs.branch != 'branch-3.3'
run: ./dev/connect-jvm-client-mima-check
- name: Install Python linter dependencies
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
@@ -16,49 +16,83 @@
*/
package org.apache.spark.sql.connect.client

import java.io.File
import java.io.{File, Writer}
import java.net.URLClassLoader
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import java.util.regex.Pattern

import com.typesafe.tools.mima.core._
import scala.reflect.runtime.universe.runtimeMirror

import com.typesafe.tools.mima.core.{Problem, ProblemFilter, ProblemFilters}
import com.typesafe.tools.mima.lib.MiMaLib

import org.apache.spark.sql.connect.client.util.ConnectFunSuite
import org.apache.spark.sql.connect.client.util.IntegrationTestUtils._
import org.apache.spark.util.ChildFirstURLClassLoader

/**
* This test checks the binary compatibility of the connect client API against the spark SQL API
* using MiMa. We did not write this check using a SBT build rule as the rule cannot provide the
* same level of freedom as a test. With a test we can:
* A tool for checking the binary compatibility of the connect client API against the Spark SQL
* API using MiMa. We did not write this check as an SBT build rule because a build rule cannot
* provide the same level of freedom as a test. With a test we can:
* 1. Specify any two jars to run the compatibility check.
* 1. Easily make the test automatically pick up all new methods added while the client is being
* built.
*
* The test requires the following artifacts built before running:
* {{{
* spark-sql
* spark-connect-client-jvm
* }}}
* To build the above artifacts, use e.g. `build/sbt package` or `build/mvn clean install
* -DskipTests`.
*
* When debugging this test, if there are any changes to the client API, the client jar needs to
* be rebuilt before running the test. An example SBT workflow for this test:
* 1. Compatibility test has reported an unexpected client API change.
* 1. Fix the wrong client API.
* 1. Build the client jar: `build/sbt package`
* 1. Run the test again: `build/sbt "testOnly
* org.apache.spark.sql.connect.client.CompatibilitySuite"`
* We can run this check by executing the `dev/connect-jvm-client-mima-check` script.
*/
class CompatibilitySuite extends ConnectFunSuite {
// scalastyle:off println
object CheckConnectJvmClientCompatibility {

private lazy val clientJar: File =
findJar(
"connector/connect/client/jvm",
"spark-connect-client-jvm-assembly",
"spark-connect-client-jvm")
private lazy val sparkHome: String = {
if (!sys.env.contains("SPARK_HOME")) {
throw new IllegalArgumentException("SPARK_HOME is not set.")
}
sys.env("SPARK_HOME")
}

private lazy val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql")
def main(args: Array[String]): Unit = {
var resultWriter: Writer = null
try {
resultWriter = Files.newBufferedWriter(
Paths.get(s"$sparkHome/.connect-mima-check-result"),
StandardCharsets.UTF_8)
val clientJar: File =
findJar(
"connector/connect/client/jvm",
"spark-connect-client-jvm-assembly",
"spark-connect-client-jvm")
val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql")
val problems = checkMiMaCompatibility(clientJar, sqlJar)
if (problems.nonEmpty) {
resultWriter.write(s"ERROR: Comparing client jar: $clientJar and and sql jar: $sqlJar \n")
resultWriter.write(s"problems: \n")
resultWriter.write(s"${problems.map(p => p.description("client")).mkString("\n")}")
resultWriter.write("\n")
resultWriter.write(
"Exceptions to binary compatibility can be added in " +
"'CheckConnectJvmClientCompatibility#checkMiMaCompatibility'\n")
}
val incompatibleApis = checkDatasetApiCompatibility(clientJar, sqlJar)
if (incompatibleApis.nonEmpty) {
resultWriter.write(
"ERROR: The Dataset apis only exist in the connect client " +
"module and not belong to the sql module include: \n")
resultWriter.write(incompatibleApis.mkString("\n"))
resultWriter.write("\n")
resultWriter.write(
"Exceptions can be added to exceptionMethods in " +
"'CheckConnectJvmClientCompatibility#checkDatasetApiCompatibility'\n")
}
} catch {
case e: Throwable =>
println(e.getMessage)
resultWriter.write(s"ERROR: ${e.getMessage}")
} finally {
if (resultWriter != null) {
resultWriter.close()
}
}
}

/**
* MiMa takes an old jar (sql jar) and a new jar (client jar) as inputs and then reports all
@@ -67,7 +101,7 @@ class CompatibilitySuite extends ConnectFunSuite {
* need to be checked. Then exclude rules are applied to filter out all unsupported methods in
* the client classes.
*/
test("compatibility MiMa tests") {
private def checkMiMaCompatibility(clientJar: File, sqlJar: File): List[Problem] = {
val mima = new MiMaLib(Seq(clientJar, sqlJar))
val allProblems = mima.collectProblems(sqlJar, clientJar, List.empty)
val includedRules = Seq(
@@ -184,30 +218,36 @@
.filter { p =>
excludeRules.forall(rule => rule(p))
}

if (problems.nonEmpty) {
fail(
s"\nComparing client jar: $clientJar\nand sql jar: $sqlJar\n" +
problems.map(p => p.description("client")).mkString("\n"))
}
problems
}
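
The concrete include and exclude rule lists are collapsed in this diff view. As a rough illustration only (the entries below are examples, not necessarily the rules this PR adds), the include rules are `IncludeByName` patterns selecting the client classes to compare, and the exclude rules are MiMa `ProblemFilters` dropping members known to be unsupported:

  // Illustrative sketch, not the actual rule lists in this file.
  val includedRules = Seq(IncludeByName("org.apache.spark.sql.Dataset.*"))
  val excludeRules = Seq(
    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.collectResult"))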

test("compatibility API tests: Dataset") {
val clientClassLoader: URLClassLoader = new URLClassLoader(Seq(clientJar.toURI.toURL).toArray)
val sqlClassLoader: URLClassLoader = new URLClassLoader(Seq(sqlJar.toURI.toURL).toArray)
private def checkDatasetApiCompatibility(clientJar: File, sqlJar: File): Seq[String] = {

Contributor Author: Note: when running CheckConnectJvmClientCompatibility in the connect/jvm module, this check will fail due to #39712 (comment)


Contributor: Hi, I would recommend using this PR. As for the failing test, let's just delete it; I was planning to delete it after this PR gets merged. The test case is already covered by the MiMa check. Thanks so much. This change looks awesome.


Contributor: @zhenlineo is this only supposed to check Dataset? What about the other classes?


Contributor: This test was initially added because we did not cover Dataset in the MiMa check.

If we find such a test useful and want to expand it to more classes, we can run the MiMa check the other way around to ensure that our new code does not introduce more unwanted methods.
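
As a sketch of this suggestion (illustrative only, not code added by this PR), the same `MiMaLib` API could be run in the reverse direction, treating the client jar as the "old" artifact so that members present only in the client are reported:

  val mima = new MiMaLib(Seq(clientJar, sqlJar))
  // Reversed arguments compared to checkMiMaCompatibility: this reports members that exist
  // in the client jar but are missing from the sql jar.
  val clientOnlyProblems = mima.collectProblems(clientJar, sqlJar, List.empty)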


val clientClass = clientClassLoader.loadClass("org.apache.spark.sql.Dataset")
val sqlClass = sqlClassLoader.loadClass("org.apache.spark.sql.Dataset")
def methods(jar: File, className: String): Seq[String] = {
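// Loads the class from the given jar with a child-first classloader (so the jar's copy of the
// class wins over anything on this tool's own classpath) and lists its public methods via reflection.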
val classLoader: URLClassLoader =
new ChildFirstURLClassLoader(Seq(jar.toURI.toURL).toArray, this.getClass.getClassLoader)
val mirror = runtimeMirror(classLoader)
// scalastyle:off classforname
val classSymbol =
mirror.classSymbol(Class.forName(className, false, classLoader))
// scalastyle:on classforname
classSymbol.typeSignature.members
.filter(_.isMethod)
.map(_.asMethod)
.filter(m => m.isPublic)
.map(_.fullName)
.toSeq
}

val newMethods = clientClass.getMethods
val oldMethods = sqlClass.getMethods
val className = "org.apache.spark.sql.Dataset"
val clientMethods = methods(clientJar, className)
val sqlMethods = methods(sqlJar, className)
// Exclude public methods that are allowed to exist only in the client; such exceptions are registered in `exceptionMethods`
val exceptionMethods =

Contributor Author: @hvanhovell exceptionMethods has been added and the change to Dataset has been reverted. Can we simply fix it like this first?


Contributor Author: If I can refine this check, I will follow up later.


Contributor: looks good

Seq("org.apache.spark.sql.Dataset.collectResult", "org.apache.spark.sql.Dataset.plan")

// For now we simply check that the new methods are a subset of the old methods.
newMethods
.map(m => m.toString)
.foreach(method => {
assert(oldMethods.map(m => m.toString).contains(method))
})
// Find new public functions that are not in sql module `Dataset`.
clientMethods.diff(sqlMethods).diff(exceptionMethods)
}

private case class IncludeByName(name: String) extends ProblemFilter {
72 changes: 72 additions & 0 deletions dev/connect-jvm-client-mima-check
@@ -0,0 +1,72 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -o pipefail
set -e

# Go to the Spark project root directory
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
cd "$FWDIR"
export SPARK_HOME=$FWDIR
echo $SPARK_HOME

if [[ -x "$JAVA_HOME/bin/java" ]]; then
JAVA_CMD="$JAVA_HOME/bin/java"
else
JAVA_CMD=java
fi

rm -f .connect-mima-check-result

echo "Build sql module, connect-client-jvm module and connect-client-jvm test jar..."
build/sbt "sql/package;connect-client-jvm/assembly;connect-client-jvm/Test/package"

CONNECT_TEST_CLASSPATH="$(build/sbt -DcopyDependencies=false "export connect-client-jvm/Test/fullClasspath" | grep jar | tail -n1)"
CONNECT_CLASSPATH="$(build/sbt -DcopyDependencies=false "export connect-client-jvm/fullClasspath" | grep jar | tail -n1)"
SQL_CLASSPATH="$(build/sbt -DcopyDependencies=false "export sql/fullClasspath" | grep jar | tail -n1)"

Contributor: There will be duplication of entries here. Is that an issue?


Contributor Author: Yes, there will be duplicate entries, but that is not a problem for the time being.



echo "Do connect-client-jvm module mima check ..."

$JAVA_CMD \
-Xmx2g \
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.util.jar=ALL-UNNAMED \
-cp "$CONNECT_CLASSPATH:$CONNECT_TEST_CLASSPATH:$SQL_CLASSPATH" \
org.apache.spark.sql.connect.client.CheckConnectJvmClientCompatibility

echo "finish connect-client-jvm module mima check ..."

RESULT_SIZE=$(wc -l .connect-mima-check-result | awk '{print $1}')

# The file has no content if the check passed.
if [[ $RESULT_SIZE -eq "0" ]]; then
ERRORS=""
else
ERRORS=$(grep ERROR .connect-mima-check-result | tail -n1)
fi

if test ! -z "$ERRORS"; then
cat .connect-mima-check-result
echo -e "connect-client-jvm module mima check failed."
rm .connect-mima-check-result
exit 1
else
rm .connect-mima-check-result
echo -e "sql and connect-client-jvm module mima check passed."
fi