22 commits
456d2f3
fix problems that affect windows shell environments (cygwin/msys2/mingw)
philwalk Oct 8, 2022
89adbbd
unset SHELL variable in spark-class.cmd
philwalk Oct 10, 2022
9e8198d
[SPARK-40726][DOCS] Supplement undocumented orc configurations in doc…
dcoliversun Oct 10, 2022
9a97f8c
[SPARK-40705][SQL] Handle case of using mutable array when converting…
Amraneze Oct 10, 2022
288bdd2
[SPARK-40714][SQL] Remove `PartitionAlreadyExistsException`
MaxGekk Oct 10, 2022
67c6408
[SPARK-40534][CONNECT] Extend the support for Join with different joi…
amaliujia Oct 11, 2022
f8d68b0
[SPARK-40725][INFRA][FOLLOWUP] Mark mypy-protobuf as an optional depe…
amaliujia Oct 11, 2022
4eb0edf
[SPARK-40596][CORE] Populate ExecutorDecommission with messages in Ex…
bozhang2820 Oct 11, 2022
e927a7e
[SPARK-40677][CONNECT][FOLLOWUP] Refactor shade `relocation/rename` r…
LuciferYang Oct 11, 2022
d59f71c
[SPARK-40698][PS][SQL] Improve the precision of `product` for integra…
zhengruifeng Oct 11, 2022
4e4a848
[SPARK-40707][CONNECT] Add groupby to connect DSL and test more than …
amaliujia Oct 11, 2022
47d119d
[SPARK-40358][SQL] Migrate collection type check failures onto error …
lvshaokang Oct 11, 2022
9ddd734
[SPARK-40740][SQL] Improve listFunctions in SessionCatalog
allisonwang-db Oct 11, 2022
8e85393
[SPARK-40667][SQL] Refactor File Data Source Options
xiaonanyang-db Oct 11, 2022
d94c65e
[SPARK-40717][CONNECT] Support Column Alias in the Connect DSL
amaliujia Oct 11, 2022
6c182da
[SPARK-40744][PS] Make `_reduce_for_stat_function` in `groupby` accep…
zhengruifeng Oct 11, 2022
8e31554
[SPARK-40742][CORE][SQL] Fix Java compilation warnings related to gen…
LuciferYang Oct 11, 2022
efd9ef9
[SPARK-40735] Consistently invoke bash with /usr/bin/env bash in scri…
huangxiaopingRD Oct 11, 2022
996e407
[SPARK-40663][SQL] Migrate execution errors onto error classes: _LEGA…
itholic Oct 11, 2022
c6a9569
fix problems that affect windows shell environments (cygwin/msys2/mingw)
philwalk Oct 8, 2022
c99e267
unset SHELL variable in spark-class.cmd
philwalk Oct 10, 2022
1c448d6
rebase to trigger build
philwalk Oct 11, 2022
2 changes: 1 addition & 1 deletion R/check-cran.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
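The same one-line change from `#!/bin/bash` to `#!/usr/bin/env bash` recurs across the scripts below (SPARK-40735 in the commit list). A minimal sketch of why the env form is the portable one, with illustrative paths:

#!/usr/bin/env bash
# env resolves bash through PATH instead of assuming it lives at /bin/bash.
# On BSDs, NixOS, or Windows POSIX layers (cygwin/msys2/mingw), bash is often
# installed elsewhere, so a hard-coded /bin/bash shebang can fail outright.
command -v bash   # shows the bash that env would select, e.g. /usr/bin/bash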
2 changes: 1 addition & 1 deletion R/create-docs.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion R/create-rd.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion R/find-r.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion R/install-dev.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion R/install-source-package.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion R/run-tests.sh
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
3 changes: 2 additions & 1 deletion bin/spark-class
@@ -77,7 +77,8 @@ set +o posix
CMD=()
DELIM=$'\n'
CMD_START_FLAG="false"
while IFS= read -d "$DELIM" -r ARG; do
while IFS= read -d "$DELIM" -r _ARG; do
ARG=${_ARG//$'\r'} # on Windows, args can have a trailing CR
if [ "$CMD_START_FLAG" == "true" ]; then
CMD+=("$ARG")
else
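A minimal sketch of the CR-stripping idiom above, assuming input that carries Windows CR+LF line endings:

#!/usr/bin/env bash
# Under cygwin/msys2, output read from a native Windows process can end each
# line with CR+LF; `read` strips the LF but leaves a trailing CR on the value.
_ARG=$'--class\r'
ARG=${_ARG//$'\r'}       # delete every carriage return
printf '[%s]\n' "$ARG"   # prints [--class], with no stray CR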
3 changes: 3 additions & 0 deletions bin/spark-class.cmd
@@ -22,4 +22,7 @@ rem the environment, it just launches a new cmd to do the real work.

rem The outermost quotes are used to prevent Windows command line parse error
rem when there are some quotes in parameters, see SPARK-21877.

rem SHELL must be unset in non-SHELL environment
set SHELL=
cmd /V /E /C ""%~dp0spark-class2.cmd" %*"
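For context on `set SHELL=`: POSIX shells on Windows export SHELL, and environment variables are inherited by child processes, so the value would otherwise leak into the spawned cmd.exe and make downstream scripts conclude they are running in a POSIX environment. A small illustrative check from the launching shell:

# Under msys2/cygwin the launching shell typically has SHELL set:
echo "$SHELL"   # e.g. /usr/bin/bash
# cmd.exe inherits it unless cleared, hence `set SHELL=` above.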
2 changes: 1 addition & 1 deletion bin/sparkR
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion binder/postBuild
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
2 changes: 1 addition & 1 deletion build/spark-build-info
@@ -24,7 +24,7 @@

RESOURCE_DIR="$1"
mkdir -p "$RESOURCE_DIR"
SPARK_BUILD_INFO="${RESOURCE_DIR}"/spark-version-info.properties
SPARK_BUILD_INFO="${RESOURCE_DIR%/}"/spark-version-info.properties

echo_build_properties() {
echo version=$1
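A minimal sketch of the `${VAR%/}` expansion introduced above, assuming a caller that passes the resource directory with a trailing slash:

#!/usr/bin/env bash
# ${RESOURCE_DIR%/} removes a single trailing slash, so the generated path
# never contains a doubled "//".
RESOURCE_DIR="target/extra-resources/"   # hypothetical input
printf '%s\n' "${RESOURCE_DIR%/}/spark-version-info.properties"
# -> target/extra-resources/spark-version-info.properties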
SparkAvroKeyOutputFormat.java
@@ -25,6 +25,7 @@
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
@@ -53,7 +54,7 @@ protected RecordWriter<AvroKey<GenericRecord>, NullWritable> create(
CodecFactory compressionCodec,
OutputStream outputStream,
int syncInterval) throws IOException {
return new SparkAvroKeyRecordWriter(
return new SparkAvroKeyRecordWriter<>(
writerSchema, dataModel, compressionCodec, outputStream, syncInterval, metadata);
}
}
@@ -72,7 +73,7 @@ class SparkAvroKeyRecordWriter<T> extends RecordWriter<AvroKey<T>, NullWritable>
OutputStream outputStream,
int syncInterval,
Map<String, String> metadata) throws IOException {
this.mAvroFileWriter = new DataFileWriter(dataModel.createDatumWriter(writerSchema));
this.mAvroFileWriter = new DataFileWriter<>(new GenericDatumWriter<>(writerSchema, dataModel));
for (Map.Entry<String, String> entry : metadata.entrySet()) {
this.mAvroFileWriter.setMeta(entry.getKey(), entry.getValue());
}
AvroOptions.scala
@@ -25,7 +25,7 @@ import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.FileSourceOptions
import org.apache.spark.sql.catalyst.{DataSourceOptions, FileSourceOptions}
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, FailFastMode, ParseMode}
import org.apache.spark.sql.internal.SQLConf

@@ -37,6 +37,8 @@ private[sql] class AvroOptions(
@transient val conf: Configuration)
extends FileSourceOptions(parameters) with Logging {

import AvroOptions._

def this(parameters: Map[String, String], conf: Configuration) = {
this(CaseInsensitiveMap(parameters), conf)
}
@@ -54,8 +56,8 @@
* instead of "string" type in the default converted schema.
*/
val schema: Option[Schema] = {
parameters.get("avroSchema").map(new Schema.Parser().setValidateDefaults(false).parse).orElse({
val avroUrlSchema = parameters.get("avroSchemaUrl").map(url => {
parameters.get(AVRO_SCHEMA).map(new Schema.Parser().setValidateDefaults(false).parse).orElse({
val avroUrlSchema = parameters.get(AVRO_SCHEMA_URL).map(url => {
log.debug("loading avro schema from url: " + url)
val fs = FileSystem.get(new URI(url), conf)
val in = fs.open(new Path(url))
@@ -75,20 +77,20 @@
* whose field names do not match. Defaults to false.
*/
val positionalFieldMatching: Boolean =
parameters.get("positionalFieldMatching").exists(_.toBoolean)
parameters.get(POSITIONAL_FIELD_MATCHING).exists(_.toBoolean)

/**
* Top level record name in write result, which is required in Avro spec.
* See https://avro.apache.org/docs/1.11.1/specification/#schema-record .
* Default value is "topLevelRecord"
*/
val recordName: String = parameters.getOrElse("recordName", "topLevelRecord")
val recordName: String = parameters.getOrElse(RECORD_NAME, "topLevelRecord")

/**
* Record namespace in write result. Default value is "".
* See Avro spec for details: https://avro.apache.org/docs/1.11.1/specification/#schema-record .
*/
val recordNamespace: String = parameters.getOrElse("recordNamespace", "")
val recordNamespace: String = parameters.getOrElse(RECORD_NAMESPACE, "")

/**
* The `ignoreExtension` option controls ignoring of files without `.avro` extensions in read.
@@ -104,7 +106,7 @@
ignoreFilesWithoutExtensionByDefault)

parameters
.get(AvroOptions.ignoreExtensionKey)
.get(IGNORE_EXTENSION)
.map(_.toBoolean)
.getOrElse(!ignoreFilesWithoutExtension)
}
@@ -116,21 +118,21 @@
* taken into account. If the former one is not set too, the `snappy` codec is used by default.
*/
val compression: String = {
parameters.get("compression").getOrElse(SQLConf.get.avroCompressionCodec)
parameters.get(COMPRESSION).getOrElse(SQLConf.get.avroCompressionCodec)
}

val parseMode: ParseMode =
parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode)
parameters.get(MODE).map(ParseMode.fromString).getOrElse(FailFastMode)

/**
* The rebasing mode for the DATE and TIMESTAMP_MICROS, TIMESTAMP_MILLIS values in reads.
*/
val datetimeRebaseModeInRead: String = parameters
.get(AvroOptions.DATETIME_REBASE_MODE)
.get(DATETIME_REBASE_MODE)
.getOrElse(SQLConf.get.getConf(SQLConf.AVRO_REBASE_MODE_IN_READ))
}

private[sql] object AvroOptions {
private[sql] object AvroOptions extends DataSourceOptions {
def apply(parameters: Map[String, String]): AvroOptions = {
val hadoopConf = SparkSession
.getActiveSession
@@ -139,11 +141,17 @@
new AvroOptions(CaseInsensitiveMap(parameters), hadoopConf)
}

val ignoreExtensionKey = "ignoreExtension"

val IGNORE_EXTENSION = newOption("ignoreExtension")
val MODE = newOption("mode")
val RECORD_NAME = newOption("recordName")
val COMPRESSION = newOption("compression")
val AVRO_SCHEMA = newOption("avroSchema")
val AVRO_SCHEMA_URL = newOption("avroSchemaUrl")
val RECORD_NAMESPACE = newOption("recordNamespace")
val POSITIONAL_FIELD_MATCHING = newOption("positionalFieldMatching")
// The option controls rebasing of the DATE and TIMESTAMP values between
// Julian and Proleptic Gregorian calendars. It impacts the behaviour of the Avro
// datasource similarly to the SQL config `spark.sql.avro.datetimeRebaseModeInRead`,
// and can be set to the same values: `EXCEPTION`, `LEGACY` or `CORRECTED`.
val DATETIME_REBASE_MODE = "datetimeRebaseMode"
val DATETIME_REBASE_MODE = newOption("datetimeRebaseMode")
}
AvroUtils.scala
@@ -34,7 +34,7 @@ import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro.AvroOptions.ignoreExtensionKey
import org.apache.spark.sql.avro.AvroOptions.IGNORE_EXTENSION
import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow}
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
@@ -50,8 +50,8 @@ private[sql] object AvroUtils extends Logging {
val conf = spark.sessionState.newHadoopConfWithOptions(options)
val parsedOptions = new AvroOptions(options, conf)

if (parsedOptions.parameters.contains(ignoreExtensionKey)) {
logWarning(s"Option $ignoreExtensionKey is deprecated. Please use the " +
if (parsedOptions.parameters.contains(IGNORE_EXTENSION)) {
logWarning(s"Option $IGNORE_EXTENSION is deprecated. Please use the " +
"general data source option pathGlobFilter for filtering file names.")
}
// User can specify an optional avro json schema.
AvroSuite.scala
@@ -1075,7 +1075,7 @@ abstract class AvroSuite
.save(s"$tempDir/${UUID.randomUUID()}")
}.getMessage
assert(message.contains("Caused by: java.lang.NullPointerException: "))
assert(message.contains("null in string in field Name"))
assert(message.contains("null value for (non-nullable) string at test_schema.Name"))
}
}

@@ -1804,13 +1804,13 @@ abstract class AvroSuite
spark
.read
.format("avro")
.option(AvroOptions.ignoreExtensionKey, false)
.option(AvroOptions.IGNORE_EXTENSION, false)
.load(dir.getCanonicalPath)
.count()
}
val deprecatedEvents = logAppender.loggingEvents
.filter(_.getMessage.getFormattedMessage.contains(
s"Option ${AvroOptions.ignoreExtensionKey} is deprecated"))
s"Option ${AvroOptions.IGNORE_EXTENSION} is deprecated"))
assert(deprecatedEvents.size === 1)
}
}
@@ -2272,6 +2272,20 @@ abstract class AvroSuite
checkAnswer(df2, df.collect().toSeq)
}
}

test("SPARK-40667: validate Avro Options") {
assert(AvroOptions.getAllOptions.size == 9)
// Please add validation on any new Avro options here
assert(AvroOptions.isValidOption("ignoreExtension"))
assert(AvroOptions.isValidOption("mode"))
assert(AvroOptions.isValidOption("recordName"))
assert(AvroOptions.isValidOption("compression"))
assert(AvroOptions.isValidOption("avroSchema"))
assert(AvroOptions.isValidOption("avroSchemaUrl"))
assert(AvroOptions.isValidOption("recordNamespace"))
assert(AvroOptions.isValidOption("positionalFieldMatching"))
assert(AvroOptions.isValidOption("datetimeRebaseMode"))
}
}

class AvroV1Suite extends AvroSuite {
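To run the new option-validation test locally, something like the following should work (module name and test pattern are assumptions based on the Spark sbt layout):

# Run the Avro suites (AvroV1Suite/AvroV2Suite extend AvroSuite)
build/sbt "avro/testOnly *AvroSuite*"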
2 changes: 2 additions & 0 deletions connector/connect/dev/generate_protos.sh
@@ -1,3 +1,5 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
66 changes: 53 additions & 13 deletions connector/connect/pom.xml
@@ -268,11 +268,13 @@
as assembly build.
-->
<include>com.google.android:annotations</include>
<include>com.google.api.grpc:proto-google-common-proto</include>
<include>com.google.api.grpc:proto-google-common-protos</include>
<include>io.perfmark:perfmark-api</include>
<include>org.codehaus.mojo:animal-sniffer-annotations</include>
<include>com.google.errorprone:error_prone_annotations</include>
<include>com.google.j2objc:j2objc-annotations</include>
<include>org.checkerframework:checker-qual</include>
<include>com.google.code.gson:gson</include>
</includes>
</artifactSet>
<relocations>
@@ -303,28 +305,66 @@
</relocation>

<relocation>
<pattern>com.google.android</pattern>
<shadedPattern>${spark.shade.packageName}.connect.android</shadedPattern>
<pattern>android.annotation</pattern>
<shadedPattern>${spark.shade.packageName}.connect.android_annotation</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.api.grpc</pattern>
<shadedPattern>${spark.shade.packageName}.connect.api</shadedPattern>
<pattern>io.perfmark</pattern>
<shadedPattern>${spark.shade.packageName}.connect.io_perfmark</shadedPattern>
</relocation>
<relocation>
<pattern>io.perfmark</pattern>
<shadedPattern>${spark.shade.packageName}.connect.perfmark</shadedPattern>
<pattern>org.codehaus.mojo.animal_sniffer</pattern>
<shadedPattern>${spark.shade.packageName}.connect.animal_sniffer</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.j2objc.annotations</pattern>
<shadedPattern>${spark.shade.packageName}.connect.j2objc_annotations</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.errorprone.annotations</pattern>
<shadedPattern>${spark.shade.packageName}.connect.errorprone_annotations</shadedPattern>
</relocation>
<relocation>
<pattern>org.checkerframework</pattern>
<shadedPattern>${spark.shade.packageName}.connect.checkerframework</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.gson</pattern>
<shadedPattern>${spark.shade.packageName}.connect.gson</shadedPattern>
</relocation>

<!--
For `com.google.api.grpc:proto-google-common-protos`, do not directly define the pattern
as `com.google`; otherwise the relocation result may be uncertain due to the order in
which the relocation rules are applied.
-->
<relocation>
<pattern>com.google.api</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.api</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.cloud</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.cloud</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.geo</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.geo</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.logging</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.logging</shadedPattern>
</relocation>
<relocation>
<pattern>org.codehaus.mojo</pattern>
<shadedPattern>${spark.shade.packageName}.connect.mojo</shadedPattern>
<pattern>com.google.longrunning</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.longrunning</shadedPattern>
</relocation>
<relocation>
<pattern>com.google.errorprone</pattern>
<shadedPattern>${spark.shade.packageName}.connect.errorprone</shadedPattern>
<pattern>com.google.rpc</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.rpc</shadedPattern>
</relocation>
<relocation>
<pattern>com.com.google.j2objc</pattern>
<shadedPattern>${spark.shade.packageName}.connect.j2objc</shadedPattern>
<pattern>com.google.type</pattern>
<shadedPattern>${spark.shade.packageName}.connect.google_protos.type</shadedPattern>
</relocation>
</relocations>
</configuration>
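A quick way to sanity-check the relocation rules above is to list the shaded jar and confirm the third-party packages landed under the shade root (the build invocation, jar name, and the `org.sparkproject` prefix are assumptions):

# Build the connect module, then inspect the shaded artifact:
build/mvn -pl connector/connect -am -DskipTests package
jar tf connector/connect/target/spark-connect_2.12-*.jar \
  | grep 'org/sparkproject/connect/google_protos/' | head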
commands.proto
@@ -44,8 +44,8 @@ message CreateScalarFunction {
repeated string parts = 1;
FunctionLanguage language = 2;
bool temporary = 3;
repeated Type argument_types = 4;
Type return_type = 5;
repeated DataType argument_types = 4;
DataType return_type = 5;

// How the function body is defined:
oneof function_definition {
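After a proto change such as the `Type` to `DataType` rename above, the generated sources need to be regenerated; the dev script added in this PR appears to be the entry point for that:

# Regenerate Spark Connect protobuf/gRPC sources after editing .proto files
connector/connect/dev/generate_protos.sh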