3 changes: 3 additions & 0 deletions .github/workflows/build_and_test.yml
@@ -564,6 +564,9 @@ jobs:
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Spark connect jvm client mima check
if: inputs.branch != 'branch-3.2' && inputs.branch != 'branch-3.3'
run: ./dev/connect-jvm-client-mima-check
- name: Install Python linter dependencies
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
@@ -16,49 +16,83 @@
*/
package org.apache.spark.sql.connect.client

import java.io.File
import java.io.{File, Writer}
import java.net.URLClassLoader
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import java.util.regex.Pattern

import com.typesafe.tools.mima.core._
import scala.reflect.runtime.universe.runtimeMirror

import com.typesafe.tools.mima.core.{Problem, ProblemFilter, ProblemFilters}
import com.typesafe.tools.mima.lib.MiMaLib

import org.apache.spark.sql.connect.client.util.ConnectFunSuite
import org.apache.spark.sql.connect.client.util.IntegrationTestUtils._
import org.apache.spark.util.ChildFirstURLClassLoader

/**
* This test checks the binary compatibility of the connect client API against the spark SQL API
* using MiMa. We did not write this check using a SBT build rule as the rule cannot provide the
* same level of freedom as a test. With a test we can:
* A tool for checking the binary compatibility of the connect client API against the Spark SQL
* API using MiMa. We did not write this check as an SBT build rule because a build rule cannot
* provide the same level of freedom as a test. With a test we can:
* 1. Specify any two jars to run the compatibility check.
* 1. Easily make the test automatically pick up all new methods added while the client is being
* built.
*
* The test requires the following artifacts built before running:
* {{{
* spark-sql
* spark-connect-client-jvm
* }}}
* To build the above artifacts, use e.g. `build/sbt package` or `build/mvn clean install
* -DskipTests`.
*
* When debugging this test, if there are any changes to the client API, the client jar needs to
* be rebuilt before running the test. An example SBT workflow for this test:
* 1. Compatibility test has reported an unexpected client API change.
* 1. Fix the wrong client API.
* 1. Build the client jar: `build/sbt package`
* 1. Run the test again: `build/sbt "testOnly
* org.apache.spark.sql.connect.client.CompatibilitySuite"`
* We can run this check by executing the `dev/connect-jvm-client-mima-check` script.
*/
class CompatibilitySuite extends ConnectFunSuite {
// scalastyle:off println
object CheckConnectJvmClientCompatibility {

private lazy val clientJar: File =
findJar(
"connector/connect/client/jvm",
"spark-connect-client-jvm-assembly",
"spark-connect-client-jvm")
private lazy val sparkHome: String = {
if (!sys.env.contains("SPARK_HOME")) {
throw new IllegalArgumentException("SPARK_HOME is not set.")
}
sys.env("SPARK_HOME")
}

private lazy val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql")
def main(args: Array[String]): Unit = {
var resultWriter: Writer = null
try {
resultWriter = Files.newBufferedWriter(
Paths.get(s"$sparkHome/.connect-mima-check-result"),
StandardCharsets.UTF_8)
val clientJar: File =
findJar(
"connector/connect/client/jvm",
"spark-connect-client-jvm-assembly",
"spark-connect-client-jvm")
val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql")
val problems = checkMiMaCompatibility(clientJar, sqlJar)
if (problems.nonEmpty) {
resultWriter.write(s"ERROR: Comparing client jar: $clientJar and and sql jar: $sqlJar \n")
resultWriter.write(s"problems: \n")
resultWriter.write(s"${problems.map(p => p.description("client")).mkString("\n")}")
resultWriter.write("\n")
resultWriter.write(
"Exceptions to binary compatibility can be added in " +
"'CheckConnectJvmClientCompatibility#checkMiMaCompatibility'\n")
}
val incompatibleApis = checkDatasetApiCompatibility(clientJar, sqlJar)
if (incompatibleApis.nonEmpty) {
resultWriter.write(
"ERROR: The Dataset apis only exist in the connect client " +
"module and not belong to the sql module include: \n")
resultWriter.write(incompatibleApis.mkString("\n"))
resultWriter.write("\n")
resultWriter.write(
"Exceptions can be added to exceptionMethods in " +
"'CheckConnectJvmClientCompatibility#checkDatasetApiCompatibility'\n")
}
} catch {
case e: Throwable =>
println(e.getMessage)
resultWriter.write(s"ERROR: ${e.getMessage}")
} finally {
if (resultWriter != null) {
resultWriter.close()
}
}
}

/**
* MiMa takes an old jar (sql jar) and a new jar (client jar) as inputs and then reports all
@@ -67,7 +101,7 @@ class CompatibilitySuite extends ConnectFunSuite {
* need to be checked. Then exclude rules are applied to filter out all unsupported methods in
* the client classes.
*/
test("compatibility MiMa tests") {
private def checkMiMaCompatibility(clientJar: File, sqlJar: File): List[Problem] = {
val mima = new MiMaLib(Seq(clientJar, sqlJar))
val allProblems = mima.collectProblems(sqlJar, clientJar, List.empty)
val includedRules = Seq(
@@ -184,30 +218,36 @@
.filter { p =>
excludeRules.forall(rule => rule(p))
}

if (problems.nonEmpty) {
fail(
s"\nComparing client jar: $clientJar\nand sql jar: $sqlJar\n" +
problems.map(p => p.description("client")).mkString("\n"))
}
problems
}
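
The concrete include and exclude rule lists are collapsed in this diff view. As a rough illustration only (the entries below are examples, not necessarily the rules this PR adds), the include rules are `IncludeByName` patterns selecting the client classes to compare, and the exclude rules are MiMa `ProblemFilters` dropping members known to be unsupported:

  // Illustrative sketch, not the actual rule lists in this file.
  val includedRules = Seq(IncludeByName("org.apache.spark.sql.Dataset.*"))
  val excludeRules = Seq(
    ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.collectResult"))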

test("compatibility API tests: Dataset") {
val clientClassLoader: URLClassLoader = new URLClassLoader(Seq(clientJar.toURI.toURL).toArray)
val sqlClassLoader: URLClassLoader = new URLClassLoader(Seq(sqlJar.toURI.toURL).toArray)
private def checkDatasetApiCompatibility(clientJar: File, sqlJar: File): Seq[String] = {

Contributor Author: Note: when running CheckConnectJvmClientCompatibility in the connect/jvm module, this check will fail due to #39712 (comment)


Contributor: Hi, I would recommend using this PR. As for the failing test, let's just delete it; I was planning to delete it after this PR gets merged. The test case is already covered by the MiMa check. Thanks so much. This change looks awesome.


Contributor: @zhenlineo is this only supposed to check Dataset? What about the other classes?


Contributor: This test was initially added because we did not cover Dataset in the MiMa check.

If we find such a test useful and want to expand it to more classes, we can run the MiMa check the other way around to ensure that our new code does not introduce more unwanted methods.
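
As a sketch of this suggestion (illustrative only, not code added by this PR), the same `MiMaLib` API could be run in the reverse direction, treating the client jar as the "old" artifact so that members present only in the client are reported:

  val mima = new MiMaLib(Seq(clientJar, sqlJar))
  // Reversed arguments compared to checkMiMaCompatibility: this reports members that exist
  // in the client jar but are missing from the sql jar.
  val clientOnlyProblems = mima.collectProblems(clientJar, sqlJar, List.empty)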


val clientClass = clientClassLoader.loadClass("org.apache.spark.sql.Dataset")
val sqlClass = sqlClassLoader.loadClass("org.apache.spark.sql.Dataset")
def methods(jar: File, className: String): Seq[String] = {
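// Loads the class from the given jar with a child-first classloader (so the jar's copy of the
// class wins over anything on this tool's own classpath) and lists its public methods via reflection.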
val classLoader: URLClassLoader =
new ChildFirstURLClassLoader(Seq(jar.toURI.toURL).toArray, this.getClass.getClassLoader)
val mirror = runtimeMirror(classLoader)
// scalastyle:off classforname
val classSymbol =
mirror.classSymbol(Class.forName(className, false, classLoader))
// scalastyle:on classforname
classSymbol.typeSignature.members
.filter(_.isMethod)
.map(_.asMethod)
.filter(m => m.isPublic)
.map(_.fullName)
.toSeq
}

val newMethods = clientClass.getMethods
val oldMethods = sqlClass.getMethods
val className = "org.apache.spark.sql.Dataset"
val clientMethods = methods(clientJar, className)
val sqlMethods = methods(sqlJar, className)
// Exclude public methods that are allowed to exist only in the client; such exceptions are registered in `exceptionMethods`
val exceptionMethods =

Contributor Author: @hvanhovell exceptionMethods has been added and the change to Dataset has been reverted. Can we simply fix it like this first?


Contributor Author: If I can refine this check, I will follow up later.


Contributor: looks good

Seq("org.apache.spark.sql.Dataset.collectResult", "org.apache.spark.sql.Dataset.plan")

// For now we simply check that the new methods are a subset of the old methods.
newMethods
.map(m => m.toString)
.foreach(method => {
assert(oldMethods.map(m => m.toString).contains(method))
})
// Find new public functions that are not in sql module `Dataset`.
clientMethods.diff(sqlMethods).diff(exceptionMethods)
}

private case class IncludeByName(name: String) extends ProblemFilter {
72 changes: 72 additions & 0 deletions dev/connect-jvm-client-mima-check
@@ -0,0 +1,72 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -o pipefail
set -e

# Go to the Spark project root directory
FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
cd "$FWDIR"
export SPARK_HOME=$FWDIR
echo $SPARK_HOME

if [[ -x "$JAVA_HOME/bin/java" ]]; then
JAVA_CMD="$JAVA_HOME/bin/java"
else
JAVA_CMD=java
fi

rm -f .connect-mima-check-result

echo "Build sql module, connect-client-jvm module and connect-client-jvm test jar..."
build/sbt "sql/package;connect-client-jvm/assembly;connect-client-jvm/Test/package"

CONNECT_TEST_CLASSPATH="$(build/sbt -DcopyDependencies=false "export connect-client-jvm/Test/fullClasspath" | grep jar | tail -n1)"
CONNECT_CLASSPATH="$(build/sbt -DcopyDependencies=false "export connect-client-jvm/fullClasspath" | grep jar | tail -n1)"
SQL_CLASSPATH="$(build/sbt -DcopyDependencies=false "export sql/fullClasspath" | grep jar | tail -n1)"

Contributor: There will be duplication of entries here. Is that an issue?


Contributor Author: Yes, there will be duplicate entries, but that is not a problem for the time being.



echo "Do connect-client-jvm module mima check ..."

$JAVA_CMD \
-Xmx2g \
-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.util.jar=ALL-UNNAMED \
-cp "$CONNECT_CLASSPATH:$CONNECT_TEST_CLASSPATH:$SQL_CLASSPATH" \
org.apache.spark.sql.connect.client.CheckConnectJvmClientCompatibility

echo "finish connect-client-jvm module mima check ..."

RESULT_SIZE=$(wc -l .connect-mima-check-result | awk '{print $1}')

# The file has no content if the check passed.
if [[ $RESULT_SIZE -eq "0" ]]; then
ERRORS=""
else
ERRORS=$(grep ERROR .connect-mima-check-result | tail -n1)
fi

if test ! -z "$ERRORS"; then
cat .connect-mima-check-result
echo -e "connect-client-jvm module mima check failed."
rm .connect-mima-check-result
exit 1
else
rm .connect-mima-check-result
echo -e "sql and connect-client-jvm module mima check passed."
fi