From 1e1c77f5bcdf2eae25226997d8b9d828ac3f2867 Mon Sep 17 00:00:00 2001 From: Tim Swena Date: Tue, 10 Dec 2024 13:00:04 -0600 Subject: [PATCH 1/2] fix!: `to_gbq` uploads `ArrowDtype(pa.timestamp(...))` without timezone as `DATETIME` type Release-As: 0.25.0 --- pandas_gbq/schema/pyarrow_to_bigquery.py | 10 ++++++ tests/unit/schema/test_pyarrow_to_bigquery.py | 34 +++++++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py index c63559eb..6d92851f 100644 --- a/pandas_gbq/schema/pyarrow_to_bigquery.py +++ b/pandas_gbq/schema/pyarrow_to_bigquery.py @@ -38,6 +38,16 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]: + # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use + # a special case to disambiguate them. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/450 + if pyarrow.types.is_timestamp(type_): + if type_.tz is None: + return schema.SchemaField(name, "DATETIME") + else: + return schema.SchemaField(name, "TIMESTAMP") + + detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None) if detected_type is not None: return schema.SchemaField(name, detected_type) diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py index 9a20e342..96366e8f 100644 --- a/tests/unit/schema/test_pyarrow_to_bigquery.py +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -3,12 +3,43 @@ # license that can be found in the LICENSE file. import pyarrow +import pytest +from google.cloud import bigquery from pandas_gbq.schema import pyarrow_to_bigquery +@pytest.mark.parametrize( + ( + "pyarrow_type", + "bigquery_type", + ), + ( + # All integer types should map to BigQuery INT64 (or INTEGER since + # SchemaField uses the legacy SQL names). 
See: + # https://github.com/googleapis/python-bigquery-pandas/issues/616 + (pyarrow.int8(), "INTEGER"), + (pyarrow.int16(), "INTEGER"), + (pyarrow.int32(), "INTEGER"), + (pyarrow.int64(), "INTEGER"), + (pyarrow.uint8(), "INTEGER"), + (pyarrow.uint16(), "INTEGER"), + (pyarrow.uint32(), "INTEGER"), + (pyarrow.uint64(), "INTEGER"), + # If there is no associated timezone, assume a naive (timezone-less) + # DATETIME. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/450 + (pyarrow.timestamp("ns"), "DATETIME"), + (pyarrow.timestamp("ns", tz="UTC"), "TIMESTAMP"), + ), +) +def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type): + field: bigquery.SchemaField = pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow_type) + assert field.name == "test_name" + assert field.field_type == bigquery_type + + def test_arrow_type_to_bigquery_field_unknown(): - # Default types should be picked at a higher layer. assert ( pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null()) is None @@ -16,7 +47,6 @@ def test_arrow_type_to_bigquery_field_unknown(): def test_arrow_type_to_bigquery_field_list_of_unknown(): - # Default types should be picked at a higher layer. 
assert ( pyarrow_to_bigquery.arrow_type_to_bigquery_field( "test_name", pyarrow.list_(pyarrow.null()) From 9f776292fff619e4e845c566c4f7661cc88a0cce Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 10 Dec 2024 19:03:12 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- pandas_gbq/schema/pyarrow_to_bigquery.py | 1 - tests/unit/schema/test_pyarrow_to_bigquery.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py index 6d92851f..da1a1ce8 100644 --- a/pandas_gbq/schema/pyarrow_to_bigquery.py +++ b/pandas_gbq/schema/pyarrow_to_bigquery.py @@ -47,7 +47,6 @@ def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]: else: return schema.SchemaField(name, "TIMESTAMP") - detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None) if detected_type is not None: return schema.SchemaField(name, detected_type) diff --git a/tests/unit/schema/test_pyarrow_to_bigquery.py b/tests/unit/schema/test_pyarrow_to_bigquery.py index 96366e8f..4af0760f 100644 --- a/tests/unit/schema/test_pyarrow_to_bigquery.py +++ b/tests/unit/schema/test_pyarrow_to_bigquery.py @@ -2,10 +2,10 @@ # Use of this source code is governed by a BSD-style # license that can be found in the LICENSE file. 
+from google.cloud import bigquery import pyarrow import pytest -from google.cloud import bigquery from pandas_gbq.schema import pyarrow_to_bigquery @@ -34,7 +34,9 @@ ), ) def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type): - field: bigquery.SchemaField = pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow_type) + field: bigquery.SchemaField = pyarrow_to_bigquery.arrow_type_to_bigquery_field( + "test_name", pyarrow_type + ) assert field.name == "test_name" assert field.field_type == bigquery_type