Merge branch 'master' into bigquery-relation-provider-v2
codelixir authored May 1, 2024
2 parents b543862 + 881d2cc commit 425f879
Showing 15 changed files with 212 additions and 116 deletions.
10 changes: 10 additions & 0 deletions CHANGES.md
@@ -2,6 +2,16 @@

## Next

+* PR #1205: Sending Identity token in the read API header
+* Issue #1195: Support map type with complex value
+* Issue #1215: Support predicate pushdown for DATETIME
+* BigQuery API has been upgraded to version 2.39.0
+* BigQuery Storage API has been upgraded to version 3.5.0
+* GAX has been upgraded to version 2.47.0
+* Arrow has been upgraded to version 16.0.0
+* gRPC has been upgraded to version 1.63.0
+* Netty has been upgraded to version 4.1.109.Final

## 0.37.0 - 2024-03-25

* :warning: Starting version 0.38.0 of the connector, the `spark-2.4-bigquery` version won't be released as Spark 2.4 is
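Of the entries above, the DATETIME predicate pushdown (Issue #1215) is the one most visible to users. Below is a minimal sketch of the kind of query it affects, assuming a Spark version where BigQuery DATETIME columns surface as TimestampNTZType (`java.time.LocalDateTime` literals, Spark 3.4+); the table and column names are hypothetical:

```java
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import java.time.LocalDateTime;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatetimePushdownExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("datetime-pushdown").getOrCreate();

    // Hypothetical table with a DATETIME column `created_at`.
    Dataset<Row> events = spark.read()
        .format("bigquery")
        .option("table", "myproject.mydataset.events")
        .load();

    // With this release, a LocalDateTime literal can be compiled into a
    // BigQuery `DATETIME '...'` literal and pushed down into the read,
    // instead of being evaluated row-by-row in Spark.
    Dataset<Row> recent =
        events.filter(col("created_at").gt(lit(LocalDateTime.of(2024, 1, 1, 0, 0))));

    recent.show();
  }
}
```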
6 changes: 3 additions & 3 deletions README-template.md
@@ -68,12 +68,12 @@ The latest version of the connector is publicly available in the following links
| Spark 3.3 | `gs://spark-lib/bigquery/spark-3.3-bigquery-${next-release-tag}.jar`([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-3.3-bigquery-${next-release-tag}.jar)) |
| Spark 3.2 | `gs://spark-lib/bigquery/spark-3.2-bigquery-${next-release-tag}.jar`([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-3.2-bigquery-${next-release-tag}.jar)) |
| Spark 3.1 | `gs://spark-lib/bigquery/spark-3.1-bigquery-${next-release-tag}.jar`([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-3.1-bigquery-${next-release-tag}.jar)) |
-| Spark 2.4 | `gs://spark-lib/bigquery/spark-2.4-bigquery-${next-release-tag}.jar`([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-2.4-bigquery-${next-release-tag}.jar)) |
+| Spark 2.4 | `gs://spark-lib/bigquery/spark-2.4-bigquery-0.37.0.jar`([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-2.4-bigquery-0.37.0.jar)) |
| Scala 2.13 | `gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.13-${next-release-tag}.jar` ([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.13-${next-release-tag}.jar)) |
| Scala 2.12 | `gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-${next-release-tag}.jar` ([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-${next-release-tag}.jar)) |
| Scala 2.11 | `gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.11-0.29.0.jar` ([HTTP link](https://storage.googleapis.com/spark-lib/bigquery/spark-bigquery-with-dependencies_2.11-0.29.0.jar)) |

-The first four versions are Java based connectors targeting Spark 2.4/3.1/3.2/3.3 of all Scala versions built on the new
+The first six versions are Java based connectors targeting Spark 2.4/3.1/3.2/3.3/3.4/3.5 of all Scala versions built on the new
Data Source APIs (Data Source API v2) of Spark.

The final two connectors are Scala based connectors, please use the jar relevant to your Spark installation as outlined
@@ -118,7 +118,7 @@ repository. It can be used using the `--packages` option or the
| Spark 3.3 | `com.google.cloud.spark:spark-3.3-bigquery:${next-release-tag}` |
| Spark 3.2 | `com.google.cloud.spark:spark-3.2-bigquery:${next-release-tag}` |
| Spark 3.1 | `com.google.cloud.spark:spark-3.1-bigquery:${next-release-tag}` |
-| Spark 2.4 | `com.google.cloud.spark:spark-2.4-bigquery:${next-release-tag}` |
+| Spark 2.4 | `com.google.cloud.spark:spark-2.4-bigquery:0.37.0` |
| Scala 2.13 | `com.google.cloud.spark:spark-bigquery-with-dependencies_2.13:${next-release-tag}` |
| Scala 2.12 | `com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:${next-release-tag}` |
| Scala 2.11 | `com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.29.0` |
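For context, the connector listed in these tables is attached at launch (for example via spark-submit's `--jars` or `--packages`) and then driven through the standard DataFrame reader. A minimal read sketch against the public sample table the README itself uses:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadExample {
  public static void main(String[] args) {
    // Assumes one of the connector jars/packages above is on the classpath.
    SparkSession spark = SparkSession.builder().appName("bq-read").getOrCreate();

    Dataset<Row> df = spark.read()
        .format("bigquery")
        .option("table", "bigquery-public-data.samples.shakespeare")
        .load();

    df.show(10);
  }
}
```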
4 changes: 4 additions & 0 deletions bigquery-connector-common/pom.xml
@@ -103,6 +103,10 @@
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
</dependency>
+<dependency>
+<groupId>org.apache.arrow</groupId>
+<artifactId>arrow-memory-core</artifactId>
+</dependency>
<!-- test -->
<dependency>
<groupId>com.google.api</groupId>
.../com/google/cloud/bigquery/connector/common/ArrowUtil.java
@@ -15,8 +15,8 @@
*/
package com.google.cloud.bigquery.connector.common;

-import org.apache.arrow.memory.NettyAllocationManager;
import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.memory.netty.NettyAllocationManager;

/** Common utility classes for Arrow. */
public class ArrowUtil {
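The import change tracks Arrow 16, where NettyAllocationManager moved to the `org.apache.arrow.memory.netty` package (shipped in the arrow-memory-netty artifact), while RootAllocator stays in arrow-memory-core — which is what the new pom.xml dependency above covers. A sketch of how a root allocator is typically wired to the Netty allocation manager under that layout; the helper name is illustrative, not necessarily the connector's exact code:

```java
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.memory.netty.NettyAllocationManager;

public final class ArrowAllocatorSketch {
  private ArrowAllocatorSketch() {}

  // Illustrative helper: builds a root allocator that hands out
  // Netty-managed buffers, capped at the given number of bytes.
  public static BufferAllocator newRootAllocator(long maxAllocation) {
    return new RootAllocator(
        RootAllocator.configBuilder()
            .maxAllocation(maxAllocation)
            .allocationManagerFactory(NettyAllocationManager.FACTORY)
            .build());
  }
}
```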
32 changes: 10 additions & 22 deletions cloudbuild/cloudbuild.yaml
@@ -46,22 +46,10 @@ steps:
- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4c. Run integration tests concurrently with unit tests (DSv2, Spark 2.4)
-- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
-id: 'integration-tests-2.4'
-waitFor: ['integration-tests-2.12']
-entrypoint: 'bash'
-args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-2.4']
-env:
-- 'GOOGLE_CLOUD_PROJECT=${_GOOGLE_CLOUD_PROJECT}'
-- 'TEMPORARY_GCS_BUCKET=${_TEMPORARY_GCS_BUCKET}'
-- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
-- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4d. Run integration tests concurrently with unit tests (DSv2, Spark 3.1)
+# 4c. Run integration tests concurrently with unit tests (DSv2, Spark 3.1)
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'integration-tests-3.1'
-waitFor: ['integration-tests-2.13']
+waitFor: ['integration-tests-2.12']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-3.1']
env:
@@ -70,10 +58,10 @@
- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4e. Run integration tests concurrently with unit tests (DSv2, Spark 3.2)
+# 4d. Run integration tests concurrently with unit tests (DSv2, Spark 3.2)
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'integration-tests-3.2'
-waitFor: ['integration-tests-2.4']
+waitFor: ['integration-tests-2.13']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-3.2']
env:
@@ -82,7 +70,7 @@
- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4f. Run integration tests concurrently with unit tests (DSv2, Spark 3.3)
+# 4e. Run integration tests concurrently with unit tests (DSv2, Spark 3.3)
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'integration-tests-3.3'
waitFor: ['integration-tests-3.1']
@@ -94,10 +82,10 @@
- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4g. Run integration tests concurrently with unit tests (DSv2, Spark 3.4)
+# 4f. Run integration tests concurrently with unit tests (DSv2, Spark 3.4)
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'integration-tests-3.4'
-waitFor: ['integration-tests-3.3']
+waitFor: ['integration-tests-3.2']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-3.4']
env:
@@ -106,10 +94,10 @@
- 'BIGLAKE_CONNECTION_ID=${_BIGLAKE_CONNECTION_ID}'
- 'BIGQUERY_KMS_KEY_NAME=${_BIGQUERY_KMS_KEY_NAME}'

-# 4h. Run integration tests concurrently with unit tests (DSv2, Spark 3.5)
+# 4g. Run integration tests concurrently with unit tests (DSv2, Spark 3.5)
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'integration-tests-3.5'
-waitFor: ['integration-tests-3.2']
+waitFor: ['integration-tests-3.3']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'integrationtest-3.5']
env:
@@ -121,7 +109,7 @@
# 5. Upload coverage to CodeCov
- name: 'gcr.io/$PROJECT_ID/dataproc-spark-bigquery-connector-presubmit'
id: 'upload-it-to-codecov'
-waitFor: ['integration-tests-2.12','integration-tests-2.13','integration-tests-2.4','integration-tests-3.1','integration-tests-3.2','integration-tests-3.3', 'integration-tests-3.4', 'integration-tests-3.5']
+waitFor: ['integration-tests-2.12','integration-tests-2.13','integration-tests-3.1','integration-tests-3.2','integration-tests-3.3', 'integration-tests-3.4', 'integration-tests-3.5']
entrypoint: 'bash'
args: ['/workspace/cloudbuild/presubmit.sh', 'upload-it-to-codecov']
env:
7 changes: 2 additions & 5 deletions cloudbuild/nightly.sh
@@ -42,9 +42,9 @@ case $STEP in
#coverage report
$MVN test jacoco:report jacoco:report-aggregate -Pcoverage,dsv1_2.12,dsv1_2.13,dsv2
# Run integration tests
-$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,integration,dsv1_2.12,dsv1_2.13,dsv2_2.4,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
+$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,integration,dsv1_2.12,dsv1_2.13,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
# Run acceptance tests
-$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,acceptance,dsv1_2.12,dsv1_2.13,dsv2_2.4,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
+$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,acceptance,dsv1_2.12,dsv1_2.13,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "nightly"

Expand All @@ -64,9 +64,6 @@ case $STEP in
gsutil cp "${M2REPO}/com/google/cloud/spark/spark-bigquery-with-dependencies_2.13/${BUILD_REVISION}/spark-bigquery-with-dependencies_2.13-${BUILD_REVISION}.jar" "gs://${BUCKET}"
gsutil cp "gs://${BUCKET}/spark-bigquery-with-dependencies_2.13-${BUILD_REVISION}.jar" "gs://${BUCKET}/spark-bigquery-with-dependencies_2.13-nightly-snapshot.jar"

gsutil cp "${M2REPO}/com/google/cloud/spark/spark-2.4-bigquery/${BUILD_REVISION}/spark-2.4-bigquery-${BUILD_REVISION}.jar" "gs://${BUCKET}"
gsutil cp "gs://${BUCKET}/spark-2.4-bigquery-${BUILD_REVISION}.jar" "gs://${BUCKET}/spark-2.4-bigquery-nightly-snapshot.jar"

gsutil cp "${M2REPO}/com/google/cloud/spark/spark-3.1-bigquery/${BUILD_REVISION}/spark-3.1-bigquery-${BUILD_REVISION}.jar" "gs://${BUCKET}"
gsutil cp "gs://${BUCKET}/spark-3.1-bigquery-${BUILD_REVISION}.jar" "gs://${BUCKET}/spark-3.1-bigquery-nightly-snapshot.jar"

9 changes: 2 additions & 7 deletions cloudbuild/presubmit.sh
@@ -33,13 +33,13 @@ case $STEP in
# Download maven and all the dependencies
init)
checkenv
-$MVN install -DskipTests -Pdsv1_2.12,dsv1_2.13,dsv2_2.4,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
+$MVN install -DskipTests -Pdsv1_2.12,dsv1_2.13,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
exit
;;

# Run unit tests
unittest)
-$MVN test jacoco:report jacoco:report-aggregate -Pcoverage,dsv1_2.12,dsv1_2.13,dsv2_2.4,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
+$MVN test jacoco:report jacoco:report-aggregate -Pcoverage,dsv1_2.12,dsv1_2.13,dsv2_3.1,dsv2_3.2,dsv2_3.3,dsv2_3.4,dsv2_3.5
# Upload test coverage report to Codecov
bash <(curl -s https://codecov.io/bash) -K -F "${STEP}"
;;
Expand All @@ -54,11 +54,6 @@ case $STEP in
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,integration,dsv1_2.13
;;

-# Run integration tests
-integrationtest-2.4)
-$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,integration,dsv2_2.4
-;;

# Run integration tests
integrationtest-3.1)
$MVN failsafe:integration-test failsafe:verify jacoco:report jacoco:report-aggregate -Pcoverage,integration,dsv2_3.1
.../com/google/cloud/spark/bigquery/SparkFilterUtils.java
@@ -23,6 +23,8 @@
import java.sql.Timestamp;
import java.time.Instant;
import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
@@ -332,6 +334,10 @@ static String compileValue(Object value, char arrayStart, char arrayEnd) {
// Instant uses ISO-8601 representation.
return "TIMESTAMP '" + instant.toString() + "'";
}
+if (value instanceof LocalDateTime) {
+DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
+return "DATETIME '" + ((LocalDateTime) value).format(formatter) + "'";
+}
if (value instanceof Object[]) {
return Arrays.stream((Object[]) value)
.map(SparkFilterUtils::compileValue)
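A quick standalone check of what the new branch produces — this just repeats the formatting logic shown above, outside the connector:

```java
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

public class CompileValueDemo {
  public static void main(String[] args) {
    // Mirrors the new LocalDateTime branch in compileValue above.
    DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
    LocalDateTime value = LocalDateTime.of(2024, 5, 1, 13, 45, 30, 123456000);
    String sql = "DATETIME '" + value.format(formatter) + "'";
    // Prints: DATETIME '2024-05-01 13:45:30.123456'
    System.out.println(sql);
  }
}
```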
.../com/google/cloud/spark/bigquery/metrics/DataOrigin.java
@@ -0,0 +1,24 @@
/*
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.spark.bigquery.metrics;

public enum DataOrigin {
NOT_SPECIFIED,
TABLE,
BIGLAKE,
VIEW,
QUERY
}
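The new enum carries no behavior in this commit; given its package, it presumably tags read metrics with where the data came from. A hypothetical use site, with the method and flags invented purely for illustration:

```java
import com.google.cloud.spark.bigquery.metrics.DataOrigin;

public class DataOriginSketch {
  // Hypothetical: classify a read source for metrics reporting.
  static DataOrigin classify(boolean isQuery, boolean isView, boolean isBigLake) {
    if (isQuery) return DataOrigin.QUERY;
    if (isView) return DataOrigin.VIEW;
    if (isBigLake) return DataOrigin.BIGLAKE;
    return DataOrigin.TABLE;
  }
}
```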
(Remaining changed files not shown.)