8 changes: 8 additions & 0 deletions connector/connect/client/jvm/pom.xml
@@ -33,6 +33,7 @@
<properties>
<sbt.project.name>connect-client-jvm</sbt.project.name>
<guava.version>31.0.1-jre</guava.version>
<mima.version>1.1.0</mima.version>
Contributor

The latest version is 1.1.1

Contributor Author

Yes, there is a bug in 1.1.1 where MiMa cannot check class methods if the object is marked private. So I used the same version that our SBT build uses, which is 1.1.0.

</properties>

<dependencies>
@@ -92,6 +93,13 @@
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<!-- Use mima to perform the compatibility check -->
Member

Can we use SBT to check this instead of Maven? We have only one place for MiMa in SBT so far (see also project/MimaBuild.scala and dev/mima).

Contributor Author

The SBT MiMa check has some limitations when run as an SBT rule:
It is best suited to a stable API, e.g. current vs. previous. It is not very friendly to configure for checking e.g. scala-client vs. sql while we are actively working on the scala-client API.
To be more specific, the problems I hit were:

  1. I could not configure the MiMa rule to find the current SQL SNAPSHOT jar.
  2. I could not use the ClassLoader correctly in the SBT rule to load all methods in the client API.

As a result, I ended up with this test, where we have more freedom to grow the API check coverage along with the client API.
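
For reference, this is roughly what the sbt wiring for MiMa usually looks like. This is a minimal sketch assuming sbt-mima-plugin; the project name, coordinates, version, and filter below are illustrative, not the actual Spark build:

import com.typesafe.tools.mima.core._

lazy val connectClientJvm = (project in file("connector/connect/client/jvm"))
  .settings(
    // MiMa resolves the "previous" artifact from a repository, which is why
    // pointing it at a locally built spark-sql SNAPSHOT jar is awkward (problem 1).
    mimaPreviousArtifacts := Set("org.apache.spark" %% "spark-sql" % "3.4.0"),
    // Exclusions are expressed as ProblemFilters, much like the test in this PR does.
    mimaBinaryIssueFilters ++= Seq(
      ProblemFilters.exclude[Problem]("org.apache.spark.connect.proto.*")))

The check would then run via sbt connectClientJvm/mimaReportBinaryIssues; the friction described above comes from mimaPreviousArtifacts being resolved from a repository rather than from a locally built SNAPSHOT jar.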

Member

Gotcha. Let's probably add a couple of comments here and there to make it clear; I am sure this is confusing to other developers.

Contributor

cc @dongjoon-hyun, also cc @pan3793. Do you have any suggestions for this?

Contributor Author (@zhenlineo, Jan 25, 2023)

You can check out the MiMa SBT implementation I did here: zhenlineo#6
I marked the two problems in the PR code. Unless we can fix those two problems, I do not feel it is a better solution than this PR, which calls MiMa directly in a test.

<dependency>
<groupId>com.typesafe</groupId>
<artifactId>mima-core_${scala.binary.version}</artifactId>
<version>${mima.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
@@ -19,6 +19,7 @@ package org.apache.spark.sql
import scala.collection.JavaConverters._

import org.apache.spark.connect.proto
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Column.fn
import org.apache.spark.sql.connect.client.unsupported
import org.apache.spark.sql.functions.lit
@@ -44,7 +45,7 @@ import org.apache.spark.sql.functions.lit
*
* @since 3.4.0
*/
class Column private[sql] (private[sql] val expr: proto.Expression) {
class Column private[sql] (private[sql] val expr: proto.Expression) extends Logging {
Member

Seems like we're not using this Logging

Contributor Author (@zhenlineo, Jan 25, 2023)

The Logging is needed for binary compatibility: the class type must be exactly the same as in the SQL API.

Contributor

class Column(val expr: Expression) extends Logging {

should we delete private[sql] here?

Contributor Author

My limited Scala knowledge indicates this only marks the primary constructor private. The intention is to mark the current constructor private; more constructors will be added in follow-up PRs.
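
A minimal sketch of the point being made (hypothetical class name, not PR code): the modifier restricts only the primary constructor, and public auxiliary constructors can still be added later.

package org.apache.spark.sql

// Hypothetical example: private[sql] here restricts just the primary
// constructor, so the class itself stays public.
class ColumnSketch private[sql] (private[sql] val raw: String) {
  // A public auxiliary constructor added in a later PR; it chains to the
  // private[sql] primary constructor, which is allowed from inside the class.
  def this() = this("*")
}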

Contributor

Hmm... why is it not consistent with spark.sql.Column?

Contributor Author

Our type is proto.Expression, which is not the same as Expression. I will leave it to later PRs to decide how to support Expression.

Contributor

I mean, why not

class Column(val expr: proto.Expression) extends Logging { ...

Contributor Author

Because I am not certain whether we should expose the constructor this(expr: proto.Expression) and the val expr: proto.Expression. They are not the same types as this(expr: Expression) and val expr: Expression.

Our proto.Expression is a gRPC-generated class, while Expression is in the sql package. They are different types from the binary-compatibility point of view.

Contributor

Let's keep it private[sql] for now.


/**
* Sum of this expression and another expression.
@@ -80,7 +81,7 @@ class Column private[sql] (private[sql] val expr: proto.Expression) {
}
}

object Column {
private[sql] object Column {
Contributor (@LuciferYang, Jan 26, 2023)

Both Column and Column$ have private[sql] access scope with this PR, so this is not an API for users?

It seems users cannot create a Column in their own package with this PR, for example:

package org.apache.spark.test

import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite
import org.apache.spark.sql.Column

class MyTestSuite
  extends AnyFunSuite // scalastyle:ignore funsuite
{
  test("new column") {
    val a = Column("a") // Symbol apply is inaccessible from this place
    val b = new Column(null) // No constructor accessible from here
  }
}

Contributor

I think org.apache.spark.sql.Column#apply was a public API before. If private[sql] is added to object Column, it may require more code refactoring work.

Contributor Author

Thanks for your input.

Looking at the current Column class, the SQL API gives two public ways to construct a Column:

class Column(val expr: Expression) extends Logging {

  def this(name: String) = this(name match {
    case "*" => UnresolvedStar(None)
    case _ if name.endsWith(".*") =>
      val parts = UnresolvedAttribute.parseAttributeName(name.substring(0, name.length - 2))
      UnresolvedStar(Some(parts))
    case _ => UnresolvedAttribute.quotedString(name)
  })
...

Right now the client API is very far from complete. We will add new methods in coming PRs, and I am sure there will be a Column(name: String) for users to use, but it is out of the scope of this PR to include all the public constructors the client needs.

The compatibility check added with this PR will grow its coverage as more methods are added to the client. The current check ensures that when a new method is added, it is binary compatible with the existing SQL API. When the client API coverage is up (~80%), we can switch to a more aggressive check to ensure we did not miss any methods by mistake.

Contributor

OK, I see what you mean. It seems that this is just an intermediate state, so it really doesn't need to consider user usage right now.


def apply(name: String): Column = Column { builder =>
name match {
@@ -21,7 +21,8 @@ import scala.collection.JavaConverters._
import org.apache.spark.connect.proto
import org.apache.spark.sql.connect.client.SparkResult

class Dataset(val session: SparkSession, private[sql] val plan: proto.Plan) {
class Dataset[T] private[sql] (val session: SparkSession, private[sql] val plan: proto.Plan)
extends Serializable {

/**
* Selects a set of column based expressions.
@@ -33,7 +34,7 @@ class Dataset(val session: SparkSession, private[sql] val plan: proto.Plan) {
* @since 3.4.0
*/
@scala.annotation.varargs
def select(cols: Column*): Dataset = session.newDataset { builder =>
def select(cols: Column*): DataFrame = session.newDataset { builder =>
builder.getProjectBuilder
.setInput(plan.getRoot)
.addAllExpressions(cols.map(_.expr).asJava)
@@ -50,7 +51,7 @@ class Dataset(val session: SparkSession, private[sql] val plan: proto.Plan) {
* @group typedrel
* @since 3.4.0
*/
def filter(condition: Column): Dataset = session.newDataset { builder =>
def filter(condition: Column): Dataset[T] = session.newDataset { builder =>
builder.getFilterBuilder.setInput(plan.getRoot).setCondition(condition.expr)
}

@@ -62,7 +63,7 @@ class Dataset(val session: SparkSession, private[sql] val plan: proto.Plan) {
* @group typedrel
* @since 3.4.0
*/
def limit(n: Int): Dataset = session.newDataset { builder =>
def limit(n: Int): Dataset[T] = session.newDataset { builder =>
builder.getLimitBuilder
.setInput(plan.getRoot)
.setLimit(n)
@@ -16,9 +16,12 @@
*/
package org.apache.spark.sql

import java.io.Closeable

import org.apache.arrow.memory.RootAllocator

import org.apache.spark.connect.proto
import org.apache.spark.internal.Logging
import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkResult}
import org.apache.spark.sql.connect.client.util.Cleaner

@@ -43,7 +46,9 @@ import org.apache.spark.sql.connect.client.util.Cleaner
* }}}
*/
class SparkSession(private val client: SparkConnectClient, private val cleaner: Cleaner)
extends AutoCloseable {
extends Serializable
with Closeable
with Logging {

private[this] val allocator = new RootAllocator()

@@ -53,7 +58,7 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:
*
* @since 3.4.0
*/
def sql(query: String): Dataset = newDataset { builder =>
def sql(query: String): DataFrame = newDataset { builder =>
builder.setSql(proto.SQL.newBuilder().setQuery(query))
}

@@ -63,15 +68,15 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:
*
* @since 3.4.0
*/
def range(end: Long): Dataset = range(0, end)
def range(end: Long): Dataset[java.lang.Long] = range(0, end)

/**
* Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a
* range from `start` to `end` (exclusive) with step value 1.
*
* @since 3.4.0
*/
def range(start: Long, end: Long): Dataset = {
def range(start: Long, end: Long): Dataset[java.lang.Long] = {
range(start, end, step = 1)
}

@@ -81,7 +86,7 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:
*
* @since 3.4.0
*/
def range(start: Long, end: Long, step: Long): Dataset = {
def range(start: Long, end: Long, step: Long): Dataset[java.lang.Long] = {
range(start, end, step, None)
}

@@ -91,11 +96,15 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:
*
* @since 3.4.0
*/
def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset = {
def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset[java.lang.Long] = {
range(start, end, step, Option(numPartitions))
}

private def range(start: Long, end: Long, step: Long, numPartitions: Option[Int]): Dataset = {
private def range(
start: Long,
end: Long,
step: Long,
numPartitions: Option[Int]): Dataset[java.lang.Long] = {
newDataset { builder =>
val rangeBuilder = builder.getRangeBuilder
.setStart(start)
@@ -105,11 +114,11 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:
}
}

private[sql] def newDataset(f: proto.Relation.Builder => Unit): Dataset = {
private[sql] def newDataset[T](f: proto.Relation.Builder => Unit): Dataset[T] = {
val builder = proto.Relation.newBuilder()
f(builder)
val plan = proto.Plan.newBuilder().setRoot(builder).build()
new Dataset(this, plan)
new Dataset[T](this, plan)
}

private[sql] def analyze(plan: proto.Plan): proto.AnalyzePlanResponse =
@@ -130,7 +139,7 @@ class SparkSession(private val client: SparkConnectClient, private val cleaner:

// The minimal builder needed to create a spark session.
// TODO: implements all methods mentioned in the scaladoc of [[SparkSession]]
object SparkSession {
object SparkSession extends Logging {
def builder(): Builder = new Builder()

private lazy val cleaner = {
@@ -139,7 +148,7 @@ object SparkSession {
cleaner
}

class Builder() {
class Builder() extends Logging {
private var _client = SparkConnectClient.builder().build()

def client(client: SparkConnectClient): Builder = {
@@ -0,0 +1,22 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark

package object sql {
type DataFrame = Dataset[Row]
}
@@ -0,0 +1,153 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.connect.client

import java.io.File
import java.net.URLClassLoader
import java.util.regex.Pattern

import com.typesafe.tools.mima.core._
import com.typesafe.tools.mima.lib.MiMaLib
import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite
import org.apache.spark.sql.connect.client.util.IntegrationTestUtils._

/**
* This test checks the binary compatibility of the connect client API against the Spark SQL API
* using MiMa. We did not write this check as an SBT build rule because a build rule cannot provide
* the same level of freedom as a test. With a test we can:
* 1. Specify any two jars to run the compatibility check.
* 1. Easily make the test automatically pick up all new methods added while the client is being
* built.
*
* The test requires the following artifacts built before running:
* {{{
* spark-sql
* spark-connect-client-jvm
* }}}
* To build the above artifact, use e.g. `sbt package` or `mvn clean install -DskipTests`.
*
* When debugging this test, if the client API has changed, the client jar needs to be rebuilt
* before running the test. An example workflow with SBT for this test:
* 1. Compatibility test has reported an unexpected client API change.
* 1. Fix the wrong client API.
* 1. Build the client jar: `sbt package`
* 1. Run the test again: `sbt "testOnly
* org.apache.spark.sql.connect.client.CompatibilitySuite"`
*/
class CompatibilitySuite extends AnyFunSuite { // scalastyle:ignore funsuite

private lazy val clientJar: File =
findJar(
"connector/connect/client/jvm",
"spark-connect-client-jvm-assembly",
"spark-connect-client-jvm")

private lazy val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql")

/**
* MiMa takes an old jar (sql jar) and a new jar (client jar) as inputs and then reports all
* incompatibilities found in the new jar. The incompatibility result is then filtered using
* include and exclude rules. Include rules are first applied to find all client classes that
* need to be checked. Then exclude rules are applied to filter out all unsupported methods in
* the client classes.
*/
test("compatibility MiMa tests") {
val mima = new MiMaLib(Seq(clientJar, sqlJar))
Contributor (@LuciferYang, Jan 27, 2023)

Could we add assume(sys.env.contains("GITHUB_ACTIONS")) before line 69 to make this test run only in GitHub Actions? @HyukjinKwon @zhenlineo
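
A minimal sketch of the suggested guard inside the existing test (ScalaTest's assume cancels the test when the condition is false):

// Sketch: skip the MiMa check outside of GitHub Actions, as suggested above.
test("compatibility MiMa tests") {
  assume(sys.env.contains("GITHUB_ACTIONS"))
  val mima = new MiMaLib(Seq(clientJar, sqlJar))
  // ... rest of the check unchanged
}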

val allProblems = mima.collectProblems(sqlJar, clientJar, List.empty)
val includedRules = Seq(
IncludeByName("org.apache.spark.sql.Column"),
IncludeByName("org.apache.spark.sql.Column$"),
IncludeByName("org.apache.spark.sql.Dataset"),
// TODO(SPARK-42175) Add the Dataset object definition
// IncludeByName("org.apache.spark.sql.Dataset$"),
IncludeByName("org.apache.spark.sql.DataFrame"),
IncludeByName("org.apache.spark.sql.SparkSession"),
IncludeByName("org.apache.spark.sql.SparkSession$")) ++ includeImplementedMethods(clientJar)
val excludeRules = Seq(
// Filter unsupported rules:
// Two sql overloading methods are marked experimental in the API and skipped in the client.
ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sql"),
// Skip all shaded dependencies in the client.
ProblemFilters.exclude[Problem]("org.sparkproject.*"),
ProblemFilters.exclude[Problem]("org.apache.spark.connect.proto.*"))
val problems = allProblems
.filter { p =>
includedRules.exists(rule => rule(p))
}
.filter { p =>
excludeRules.forall(rule => rule(p))
}

if (problems.nonEmpty) {
fail(
s"\nComparing client jar: $clientJar\nand sql jar: $sqlJar\n" +
problems.map(p => p.description("client")).mkString("\n"))
}
}

test("compatibility API tests: Dataset") {
val clientClassLoader: URLClassLoader = new URLClassLoader(Seq(clientJar.toURI.toURL).toArray)
val sqlClassLoader: URLClassLoader = new URLClassLoader(Seq(sqlJar.toURI.toURL).toArray)

val clientClass = clientClassLoader.loadClass("org.apache.spark.sql.Dataset")
Contributor (@LuciferYang, Feb 27, 2023)

Hi @zhenlineo @HyukjinKwon, there may be some problems with this test case. I added some logs, see:
https://github.com/apache/spark/compare/master...LuciferYang:spark:CompatibilitySuite?expand=1

From the logs, I found that both clientClass and sqlClass are loaded from file:/home/runner/work/spark/spark/connector/connect/client/jvm/target/scala-2.12/spark-connect-client-jvm_2.12-3.5.0-SNAPSHOT.jar, and the contents of newMethods and oldMethods are the same:

https://pipelines.actions.githubusercontent.com/serviceHosts/c184045e-b556-4e78-b8ef-fb37b2eda9a3/_apis/pipelines/1/runs/62963/signedlogcontent/14?urlExpires=2023-02-27T08%3A53%3A13.6716136Z&urlSigningMethod=HMACV1&urlSignature=XkRKqix4ZapzEeczn7ZhWpAFhSnwWW74UX%2BKUhocftc%3D

Contributor

At present, using this way of checking, at least 4 APIs should be reported as incompatible:

private[sql] def withResult[E](f: SparkResult => E): E
def collectResult(): SparkResult
private[sql] def analyze: proto.AnalyzePlanResponse
private[sql] val plan: proto.Plan

Because when using Java reflection, the above 4 methods are identified as public APIs, even though three of them are private[sql], and these four APIs do not exist in the Dataset of the sql module (see the sketch after this comment):

public java.lang.Object org.apache.spark.sql.Dataset.withResult(scala.Function1)$
public org.apache.spark.sql.connect.client.SparkResult org.apache.spark.sql.Dataset.collectResult()$
public org.apache.spark.connect.proto.AnalyzePlanResponse org.apache.spark.sql.Dataset.analyze()$
public org.apache.spark.connect.proto.Plan org.apache.spark.sql.Dataset.plan()$
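
A small, self-contained sketch of why reflection reports these as public (hypothetical names, not from the PR): Scala's qualified private is enforced by the compiler only and is erased to a public JVM method.

package org.example.demo

// Hypothetical illustration: private[demo] is a compile-time restriction;
// the emitted JVM method is public, so Class#getMethods returns it.
class WidgetDataset {
  private[demo] def analyze: String = "plan"
}

object ShowPublicMethods {
  def main(args: Array[String]): Unit = {
    // Prints "analyze" even though the Scala signature is qualified-private,
    // mirroring how withResult/analyze/plan surface in the client Dataset check.
    classOf[WidgetDataset].getMethods.map(_.getName).filter(_ == "analyze").foreach(println)
  }
}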

Contributor

also cc @hvanhovell

Contributor Author

Thanks so much for looking into this. The Dataset test is not as important as the MiMa test. I will check whether we can fix the issue you found; otherwise it should be safe to delete it, as the check is already covered by MiMa.

Contributor (@LuciferYang, Feb 27, 2023)

Thanks @zhenlineo, if it is already covered, we can delete it :)

Member

Thank you for the investigation, @LuciferYang.

val sqlClass = sqlClassLoader.loadClass("org.apache.spark.sql.Dataset")

val newMethods = clientClass.getMethods
val oldMethods = sqlClass.getMethods

// For now we simply check that the new methods are a subset of the old methods.
newMethods
.map(m => m.toString)
.foreach(method => {
assert(oldMethods.map(m => m.toString).contains(method))
})
}

/**
* Find all methods that are implemented in the client jar. Once all major methods are
* implemented we can switch to include all methods under the class using ".*" e.g.
* "org.apache.spark.sql.Dataset.*"
*/
private def includeImplementedMethods(clientJar: File): Seq[IncludeByName] = {
val clsNames = Seq(
"org.apache.spark.sql.Column",
// TODO(SPARK-42175) Add all overloading methods. Temporarily mute compatibility check for \
// the Dataset methods, as too many overload methods are missing.
// "org.apache.spark.sql.Dataset",
"org.apache.spark.sql.SparkSession")

val clientClassLoader: URLClassLoader = new URLClassLoader(Seq(clientJar.toURI.toURL).toArray)
clsNames
.flatMap { clsName =>
val cls = clientClassLoader.loadClass(clsName)
// all distinct method names
cls.getMethods.map(m => s"$clsName.${m.getName}").toSet
}
.map { fullName =>
IncludeByName(fullName)
}
}

private case class IncludeByName(name: String) extends ProblemFilter {
private[this] val pattern =
Pattern.compile(name.split("\\*", -1).map(Pattern.quote).mkString(".*"))

override def apply(problem: Problem): Boolean = {
pattern.matcher(problem.matchName.getOrElse("")).matches
}
}
}