fix!: to_gbq uploads ArrowDtype(pa.timestamp(...)) without timezone as DATETIME type #832

Merged · 2 commits · Dec 11, 2024
9 changes: 9 additions & 0 deletions pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -38,6 +38,15 @@


def arrow_type_to_bigquery_field(name, type_) -> Optional[schema.SchemaField]:
    # Since both TIMESTAMP/DATETIME use pyarrow.timestamp(...), we need to use
    # a special case to disambiguate them. See:
    # https://github.com/googleapis/python-bigquery-pandas/issues/450
    if pyarrow.types.is_timestamp(type_):
        if type_.tz is None:
            return schema.SchemaField(name, "DATETIME")
        else:
            return schema.SchemaField(name, "TIMESTAMP")

    detected_type = _ARROW_SCALAR_IDS_TO_BQ.get(type_.id, None)
    if detected_type is not None:
        return schema.SchemaField(name, detected_type)
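A minimal usage sketch of the new mapping (not part of the diff itself): a timezone-naive pyarrow timestamp now resolves to a DATETIME field, while a timezone-aware one resolves to TIMESTAMP. The column name and "us" unit below are arbitrary; the tests in this PR exercise the same behavior with "ns".

import pyarrow

from pandas_gbq.schema import pyarrow_to_bigquery

# A timestamp without a timezone becomes a DATETIME field.
naive = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
    "created", pyarrow.timestamp("us")
)
print(naive.field_type)  # DATETIME

# A timestamp with a timezone becomes a TIMESTAMP field.
aware = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
    "created", pyarrow.timestamp("us", tz="UTC")
)
print(aware.field_type)  # TIMESTAMP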
36 changes: 34 additions & 2 deletions tests/unit/schema/test_pyarrow_to_bigquery.py
@@ -2,21 +2,53 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

from google.cloud import bigquery
import pyarrow
import pytest

from pandas_gbq.schema import pyarrow_to_bigquery


@pytest.mark.parametrize(
    (
        "pyarrow_type",
        "bigquery_type",
    ),
    (
        # All integer types should map to BigQuery INT64 (or INTEGER since
        # SchemaField uses the legacy SQL names). See:
        # https://github.com/googleapis/python-bigquery-pandas/issues/616
        (pyarrow.int8(), "INTEGER"),
        (pyarrow.int16(), "INTEGER"),
        (pyarrow.int32(), "INTEGER"),
        (pyarrow.int64(), "INTEGER"),
        (pyarrow.uint8(), "INTEGER"),
        (pyarrow.uint16(), "INTEGER"),
        (pyarrow.uint32(), "INTEGER"),
        (pyarrow.uint64(), "INTEGER"),
        # If there is no associated timezone, assume a naive (timezone-less)
        # DATETIME. See:
        # https://github.com/googleapis/python-bigquery-pandas/issues/450
        (pyarrow.timestamp("ns"), "DATETIME"),
        (pyarrow.timestamp("ns", tz="UTC"), "TIMESTAMP"),
    ),
)
def test_arrow_type_to_bigquery_field_scalar_types(pyarrow_type, bigquery_type):
    field: bigquery.SchemaField = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "test_name", pyarrow_type
    )
    assert field.name == "test_name"
    assert field.field_type == bigquery_type


def test_arrow_type_to_bigquery_field_unknown():
    # Default types should be picked at a higher layer.
    assert (
        pyarrow_to_bigquery.arrow_type_to_bigquery_field("test_name", pyarrow.null())
        is None
    )


def test_arrow_type_to_bigquery_field_list_of_unknown():
    # Default types should be picked at a higher layer.
    assert (
        pyarrow_to_bigquery.arrow_type_to_bigquery_field(
            "test_name", pyarrow.list_(pyarrow.null())
        )
        is None
    )
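For context on the user-facing effect of this fix, here is a hedged end-to-end sketch (not part of this PR) of the to_gbq path named in the title: a column backed by a timezone-naive ArrowDtype timestamp should now be created in BigQuery as DATETIME, while a timezone-aware one remains TIMESTAMP. The dataset, table, and project names are placeholders, and running it assumes pandas 2.x (for pd.ArrowDtype) plus valid BigQuery credentials.

import pandas as pd
import pandas_gbq
import pyarrow as pa

df = pd.DataFrame(
    {
        # No timezone: expected to land as a DATETIME column.
        "local_time": pd.Series(
            [pd.Timestamp("2024-12-11 10:00:00")],
            dtype=pd.ArrowDtype(pa.timestamp("us")),
        ),
        # With timezone: expected to land as a TIMESTAMP column.
        "event_time": pd.Series(
            [pd.Timestamp("2024-12-11 10:00:00", tz="UTC")],
            dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")),
        ),
    }
)

# Placeholder destination; relies on the default schema inference exercised
# by the tests above.
pandas_gbq.to_gbq(df, "my_dataset.my_table", project_id="my-project")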