
Commit d120f8f

fix: read_gbq supports extreme DATETIME values such as 0001-01-01 00:00:00 (#444)
* fix: read out-of-bounds DATETIME values such as `0001-01-01 00:00:00`

  deps: require google-cloud-bigquery 1.26.1 or later

* feat: accepts a table ID, which downloads the table without a query
* revert tests for read_gbq fix which isn't yet resolved
* Revert "revert tests for read_gbq fix which isn't yet resolved"

  This reverts commit 2a76982.

* add todo for next steps
* add unit test for table ID read_gbq
* add helper for is_query
* implement read_gbq with table id
* fix remaining tests, don't localize out-of-bounds timestamp columns
* Update pandas_gbq/gbq.py
* fix 3.7 unit tests
* correct coverage
* skip coverage for optional test skip
* fix docs build
* improve test coverage for error case
* as of google-cloud-bigquery 1.11.0, get_table before list_rows is unnecessary
* refactor tests
* add more scalars
* add more types
* add failing time test
* add test for bignumeric
* add test for null values
* add epoch timestamps to tests
* add post-download dtype conversions
* add failing test for desired fix
* fix the issue with extreme datetimes
* fix constraints
* fix tests for empty dataframe
* fix tests for older google-cloud-bigquery
* ignore index on empty dataframe
* add db-dtypes to runtime import checks
* document dependencies
* remove TODO, since done
* remove unnecessary special case for empty dataframe

  Fixes prerelease test run

* remove redundant 'deprecated' from comment
1 parent a63ea5d commit d120f8f

11 files changed: +793 −415 lines
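For context, here is a minimal sketch of the user-facing behavior this commit targets. The project and table names are placeholders, and the exact error raised by older versions may vary by pandas release:

```python
import pandas_gbq

# A DATETIME value such as 0001-01-01 00:00:00 is outside the range of
# pandas' nanosecond-precision datetime64[ns] dtype. Before this fix the
# post-download conversion could fail with an out-of-bounds datetime error;
# now the column falls back to 'object' dtype and keeps the value as a
# datetime.datetime.
df = pandas_gbq.read_gbq(
    "SELECT DATETIME '0001-01-01 00:00:00' AS extreme_datetime",
    project_id="my-project",  # placeholder project ID
)
print(df["extreme_datetime"].dtype)  # object, because the value is out of range

# The commit also allows passing a table ID instead of a query, in which case
# the table is downloaded directly (via list_rows) without running a query.
df = pandas_gbq.read_gbq(
    "my-project.my_dataset.my_table",  # placeholder table ID
    project_id="my-project",
)
```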

ci/requirements-3.7-0.24.2.conda (+2 −1)

@@ -4,7 +4,8 @@ db-dtypes==0.3.1
 fastavro
 flake8
 numpy==1.16.6
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.27.2
+google-cloud-bigquery-storage==1.1.0
 pyarrow==3.0.0
 pydata-google-auth
 pytest

pandas_gbq/features.py (+24)

@@ -8,7 +8,10 @@
 BIGQUERY_MINIMUM_VERSION = "1.11.1"
 BIGQUERY_CLIENT_INFO_VERSION = "1.12.0"
 BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
+BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"
 BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
+BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION = "2.10.0"
+BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev"
 PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
 PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
 PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
@@ -42,6 +45,13 @@ def bigquery_installed_version(self):

         return self._bigquery_installed_version

+    @property
+    def bigquery_has_accurate_timestamp(self):
+        import pkg_resources
+
+        min_version = pkg_resources.parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
+        return self.bigquery_installed_version >= min_version
+
     @property
     def bigquery_has_client_info(self):
         import pkg_resources
@@ -51,6 +61,13 @@ def bigquery_has_client_info(self):
         )
         return self.bigquery_installed_version >= bigquery_client_info_version

+    @property
+    def bigquery_has_bignumeric(self):
+        import pkg_resources
+
+        min_version = pkg_resources.parse_version(BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION)
+        return self.bigquery_installed_version >= min_version
+
     @property
     def bigquery_has_bqstorage(self):
         import pkg_resources
@@ -69,6 +86,13 @@ def bigquery_has_from_dataframe_with_csv(self):
         )
         return self.bigquery_installed_version >= bigquery_from_dataframe_version

+    @property
+    def bigquery_needs_date_as_object(self):
+        import pkg_resources
+
+        max_version = pkg_resources.parse_version(BIGQUERY_NO_DATE_AS_OBJECT_VERSION)
+        return self.bigquery_installed_version < max_version
+
     @property
     def pandas_installed_version(self):
         import pandas
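The new properties above all follow the same capability-detection pattern: parse the installed google-cloud-bigquery version with pkg_resources and compare it against a pinned threshold. Below is a standalone sketch of that pattern; the free function and asserts are illustrative only, since in the library these checks live as properties on the Features singleton:

```python
import pkg_resources

BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"


def bigquery_has_accurate_timestamp(installed_version: str) -> bool:
    """Illustrative check: is the installed google-cloud-bigquery at or above
    the version threshold pinned in BIGQUERY_ACCURATE_TIMESTAMP_VERSION?"""
    installed = pkg_resources.parse_version(installed_version)
    minimum = pkg_resources.parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
    return installed >= minimum


assert bigquery_has_accurate_timestamp("2.6.0")
assert not bigquery_has_accurate_timestamp("1.27.2")
```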

pandas_gbq/gbq.py (+48 −36)

@@ -2,13 +2,13 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.

+from datetime import datetime
 import logging
 import re
 import time
-import warnings
-from datetime import datetime
 import typing
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Sequence, Union
+import warnings

 import numpy as np

@@ -37,21 +37,25 @@
 import pandas_gbq.schema
 import pandas_gbq.timestamp

-
-logger = logging.getLogger(__name__)
-
 try:
     import tqdm  # noqa
 except ImportError:
     tqdm = None

+logger = logging.getLogger(__name__)
+

 def _test_google_api_imports():
     try:
         import pkg_resources  # noqa
     except ImportError as ex:
         raise ImportError("pandas-gbq requires setuptools") from ex

+    try:
+        import db_dtypes  # noqa
+    except ImportError as ex:
+        raise ImportError("pandas-gbq requires db-dtypes") from ex
+
     try:
         import pydata_google_auth  # noqa
     except ImportError as ex:
@@ -546,6 +550,8 @@ def _download_results(
         to_dataframe_kwargs = {}
         if FEATURES.bigquery_has_bqstorage:
             to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
+        if FEATURES.bigquery_needs_date_as_object:
+            to_dataframe_kwargs["date_as_object"] = True

         try:
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
@@ -559,11 +565,7 @@ def _download_results(
         except self.http_error as ex:
             self.process_http_error(ex)

-        if df.empty:
-            df = _cast_empty_df_dtypes(schema_fields, df)
-
-        # Ensure any TIMESTAMP columns are tz-aware.
-        df = pandas_gbq.timestamp.localize_df(df, schema_fields)
+        df = _finalize_dtypes(df, schema_fields)

         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
@@ -617,23 +619,18 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     #missing-data-casting-rules-and-indexing
     """
+    import db_dtypes
+
     # If you update this mapping, also update the table at
     # `docs/reading.rst`.
     dtype_map = {
-        "DATE": "datetime64[ns]",
-        "DATETIME": "datetime64[ns]",
         "FLOAT": np.dtype(float),
-        "GEOMETRY": "object",
         "INTEGER": "Int64",
-        "RECORD": "object",
-        "STRING": "object",
-        # datetime.time objects cannot be case to datetime64.
-        # https://github.com/pydata/pandas-gbq/issues/328
-        "TIME": "object",
-        # pandas doesn't support timezone-aware dtype in DataFrame/Series
-        # constructors. It's more idiomatic to localize after construction.
-        # https://github.com/pandas-dev/pandas/issues/25843
-        "TIMESTAMP": "datetime64[ns]",
+        "TIME": db_dtypes.TimeDtype(),
+        # Note: Other types such as 'datetime64[ns]' and db_types.DateDtype()
+        # are not included because the pandas range does not align with the
+        # BigQuery range. We need to attempt a conversion to those types and
+        # fall back to 'object' when there are out-of-range values.
     }

     # Amend dtype_map with newer extension types if pandas version allows.
@@ -656,28 +653,43 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     return dtypes


-def _cast_empty_df_dtypes(schema_fields, df):
-    """Cast any columns in an empty dataframe to correct type.
+def _finalize_dtypes(
+    df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]]
+) -> "pandas.DataFrame":
+    """
+    Attempt to change the dtypes of those columns that don't map exactly.

-    In an empty dataframe, pandas cannot choose a dtype unless one is
-    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
-    provides dtypes when the dtype safely handles null values. This means
-    that empty int64 and boolean columns are incorrectly classified as
-    ``object``.
+    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
+    0001-01-01, but they can represent dates within a couple hundred years of
+    1970. See:
+    https://github.com/googleapis/python-bigquery-pandas/issues/365
     """
-    if not df.empty:
-        raise ValueError("DataFrame must be empty in order to cast non-nullsafe dtypes")
+    import db_dtypes
+    import pandas.api.types

-    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+    # If you update this mapping, also update the table at
+    # `docs/reading.rst`.
+    dtype_map = {
+        "DATE": db_dtypes.DateDtype(),
+        "DATETIME": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns]",
+    }

     for field in schema_fields:
-        column = str(field["name"])
+        # This method doesn't modify ARRAY/REPEATED columns.
         if field["mode"].upper() == "REPEATED":
             continue

+        name = str(field["name"])
         dtype = dtype_map.get(field["type"].upper())
-        if dtype:
-            df[column] = df[column].astype(dtype)
+
+        # Avoid deprecated conversion to timezone-naive dtype by only casting
+        # object dtypes.
+        if dtype and pandas.api.types.is_object_dtype(df[name]):
+            df[name] = df[name].astype(dtype, errors="ignore")
+
+    # Ensure any TIMESTAMP columns are tz-aware.
+    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

     return df
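The key mechanism in the new `_finalize_dtypes` is pandas' `astype(..., errors="ignore")`, which returns the column unchanged when a value cannot be represented in the target dtype. A small standalone demonstration of that fallback (not part of the diff; the series contents are illustrative):

```python
import datetime

import pandas

in_range = pandas.Series([datetime.datetime(2021, 11, 1)], dtype="object")
out_of_range = pandas.Series([datetime.datetime(1, 1, 1)], dtype="object")

# In-range values convert cleanly to pandas' nanosecond-precision dtype.
print(in_range.astype("datetime64[ns]", errors="ignore").dtype)  # datetime64[ns]

# 0001-01-01 overflows datetime64[ns], so the original object column is
# returned instead of raising an out-of-bounds error.
print(out_of_range.astype("datetime64[ns]", errors="ignore").dtype)  # object
```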

pandas_gbq/timestamp.py (−5)

@@ -30,11 +30,6 @@ def localize_df(df, schema_fields):
     pandas.DataFrame
         DataFrame with localized TIMESTAMP columns.
     """
-    if len(df.index) == 0:
-        # If there are no rows, there is nothing to do.
-        # Fix for https://github.com/pydata/pandas-gbq/issues/299
-        return df
-
     for field in schema_fields:
         column = str(field["name"])
         if "mode" in field and field["mode"].upper() == "REPEATED":

setup.py (+12 −5)

@@ -28,12 +28,19 @@
     "pandas >=0.24.2",
     "pyarrow >=3.0.0, <7.0dev",
     "pydata-google-auth",
-    "google-api-core >=1.14.0",
-    "google-auth >=1.4.1",
+    # Note: google-api-core and google-auth are also included via transitive
+    # dependency on google-cloud-bigquery, but this library also uses them
+    # directly.
+    "google-api-core >=1.21.0",
+    "google-auth >=1.18.0",
     "google-auth-oauthlib >=0.0.1",
-    # 2.4.* has a bug where waiting for the query can hang indefinitely.
-    # https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*",
+    # Require 1.27.* because it has a fix for out-of-bounds timestamps. See:
+    # https://github.com/googleapis/python-bigquery/pull/209 and
+    # https://github.com/googleapis/python-bigquery-pandas/issues/365
+    # Exclude 2.4.* because it has a bug where waiting for the query can hang
+    # indefinitely. https://github.com/pydata/pandas-gbq/issues/343
+    "google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*",
+    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",

testing/constraints-3.7.txt (+3 −3)

@@ -6,10 +6,10 @@
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
 db-dtypes==0.3.1
-google-api-core==1.14.0
-google-auth==1.4.1
+google-api-core==1.21.0
+google-auth==1.18.0
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.27.2
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
 pandas==0.24.2
