Skip to content

Commit

Permalink
fix: read_gbq supports extreme DATETIME values such as `0001-01-01 …
Browse files Browse the repository at this point in the history
…00:00:00` (#444)

* fix: read out-of-bounds DATETIME values such as `0001-01-01 00:00:00`

deps: require google-cloud-bigquery 1.26.1 or later

* feat: `read_gbq` accepts a table ID, which downloads the table without a query

* revert tests for read_gbq fix which isn't yet resolved

* Revert "revert tests for read_gbq fix which isn't yet resolved"

This reverts commit 2a76982.

* add todo for next steps

* add unit test for table ID read_gbq

* add helper for is_query

* implement read_gbq with table id

* fix remaining tests, don't localize out-of-bounds timestamp columns

* Update pandas_gbq/gbq.py

* fix 3.7 unit tests

* correct coverage

* skip coverage for optional test skip

* fix docs build

* improve test coverage for error case

* as of google-cloud-bigquery 1.11.0, get_table before list_rows is unnecessary

* refactor tests

* add more scalars

* add more types

* add failing time test

* add test for bignumeric

* add test for null values

* add epoch timestamps to tests

* add post-download dtype conversions

* add failing test for desired fix

* fix the issue with extreme datetimes

* fix constraints

* fix tests for empty dataframe

* fix tests for older google-cloud-bigquery

* ignore index on empty dataframe

* add db-dtypes to runtime import checks

* document dependencies

* remove TODO, since done

* remove unnecessary special case for empty dataframe

Fixes prerelease test run

* remove redundant 'deprecated' from comment
  • Loading branch information
tswast authored Jan 5, 2022
1 parent a63ea5d commit d120f8f
Show file tree
Hide file tree
Showing 11 changed files with 793 additions and 415 deletions.
3 changes: 2 additions & 1 deletion ci/requirements-3.7-0.24.2.conda
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ db-dtypes==0.3.1
fastavro
flake8
numpy==1.16.6
google-cloud-bigquery==1.11.1
google-cloud-bigquery==1.27.2
google-cloud-bigquery-storage==1.1.0
pyarrow==3.0.0
pydata-google-auth
pytest
Expand Down
24 changes: 24 additions & 0 deletions pandas_gbq/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
BIGQUERY_MINIMUM_VERSION = "1.11.1"
BIGQUERY_CLIENT_INFO_VERSION = "1.12.0"
BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"
BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION = "2.10.0"
BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev"
PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
Expand Down Expand Up @@ -42,6 +45,13 @@ def bigquery_installed_version(self):

return self._bigquery_installed_version

@property
def bigquery_has_accurate_timestamp(self):
    """Report whether the installed BigQuery client meets the timestamp floor.

    Returns:
        bool: True when ``self.bigquery_installed_version`` is greater than
        or equal to ``BIGQUERY_ACCURATE_TIMESTAMP_VERSION``.
    """
    # Imported lazily, matching the other feature checks in this module.
    from pkg_resources import parse_version

    required = parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
    installed = self.bigquery_installed_version
    return installed >= required

@property
def bigquery_has_client_info(self):
import pkg_resources
Expand All @@ -51,6 +61,13 @@ def bigquery_has_client_info(self):
)
return self.bigquery_installed_version >= bigquery_client_info_version

@property
def bigquery_has_bignumeric(self):
    """Report whether the installed BigQuery client meets the BIGNUMERIC floor.

    Returns:
        bool: True when ``self.bigquery_installed_version`` is greater than
        or equal to ``BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION``.
    """
    # Imported lazily, matching the other feature checks in this module.
    from pkg_resources import parse_version

    required = parse_version(BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION)
    installed = self.bigquery_installed_version
    return installed >= required

@property
def bigquery_has_bqstorage(self):
import pkg_resources
Expand All @@ -69,6 +86,13 @@ def bigquery_has_from_dataframe_with_csv(self):
)
return self.bigquery_installed_version >= bigquery_from_dataframe_version

@property
def bigquery_needs_date_as_object(self):
    """Report whether the installed BigQuery client predates the cutoff version.

    Returns:
        bool: True when ``self.bigquery_installed_version`` is strictly less
        than ``BIGQUERY_NO_DATE_AS_OBJECT_VERSION``.
    """
    # Imported lazily, matching the other feature checks in this module.
    from pkg_resources import parse_version

    cutoff = parse_version(BIGQUERY_NO_DATE_AS_OBJECT_VERSION)
    installed = self.bigquery_installed_version
    # Strict comparison: the cutoff version itself does not need the flag.
    return installed < cutoff

@property
def pandas_installed_version(self):
import pandas
Expand Down
84 changes: 48 additions & 36 deletions pandas_gbq/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

from datetime import datetime
import logging
import re
import time
import warnings
from datetime import datetime
import typing
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, Optional, Sequence, Union
import warnings

import numpy as np

Expand Down Expand Up @@ -37,21 +37,25 @@
import pandas_gbq.schema
import pandas_gbq.timestamp


logger = logging.getLogger(__name__)

try:
import tqdm # noqa
except ImportError:
tqdm = None

logger = logging.getLogger(__name__)


def _test_google_api_imports():
try:
import pkg_resources # noqa
except ImportError as ex:
raise ImportError("pandas-gbq requires setuptools") from ex

try:
import db_dtypes # noqa
except ImportError as ex:
raise ImportError("pandas-gbq requires db-dtypes") from ex

try:
import pydata_google_auth # noqa
except ImportError as ex:
Expand Down Expand Up @@ -546,6 +550,8 @@ def _download_results(
to_dataframe_kwargs = {}
if FEATURES.bigquery_has_bqstorage:
to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
if FEATURES.bigquery_needs_date_as_object:
to_dataframe_kwargs["date_as_object"] = True

try:
schema_fields = [field.to_api_repr() for field in rows_iter.schema]
Expand All @@ -559,11 +565,7 @@ def _download_results(
except self.http_error as ex:
self.process_http_error(ex)

if df.empty:
df = _cast_empty_df_dtypes(schema_fields, df)

# Ensure any TIMESTAMP columns are tz-aware.
df = pandas_gbq.timestamp.localize_df(df, schema_fields)
df = _finalize_dtypes(df, schema_fields)

logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
return df
Expand Down Expand Up @@ -617,23 +619,18 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
#missing-data-casting-rules-and-indexing
"""
import db_dtypes

# If you update this mapping, also update the table at
# `docs/reading.rst`.
dtype_map = {
"DATE": "datetime64[ns]",
"DATETIME": "datetime64[ns]",
"FLOAT": np.dtype(float),
"GEOMETRY": "object",
"INTEGER": "Int64",
"RECORD": "object",
"STRING": "object",
# datetime.time objects cannot be cast to datetime64.
# https://github.com/pydata/pandas-gbq/issues/328
"TIME": "object",
# pandas doesn't support timezone-aware dtype in DataFrame/Series
# constructors. It's more idiomatic to localize after construction.
# https://github.com/pandas-dev/pandas/issues/25843
"TIMESTAMP": "datetime64[ns]",
"TIME": db_dtypes.TimeDtype(),
# Note: Other types such as 'datetime64[ns]' and db_dtypes.DateDtype()
# are not included because the pandas range does not align with the
# BigQuery range. We need to attempt a conversion to those types and
# fall back to 'object' when there are out-of-range values.
}

# Amend dtype_map with newer extension types if pandas version allows.
Expand All @@ -656,28 +653,43 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
return dtypes


def _cast_empty_df_dtypes(schema_fields, df):
"""Cast any columns in an empty dataframe to correct type.
def _finalize_dtypes(
df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]]
) -> "pandas.DataFrame":
"""
Attempt to change the dtypes of those columns that don't map exactly.
In an empty dataframe, pandas cannot choose a dtype unless one is
explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
provides dtypes when the dtype safely handles null values. This means
that empty int64 and boolean columns are incorrectly classified as
``object``.
For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
0001-01-01, but they can represent dates within a couple hundred years of
1970. See:
https://github.com/googleapis/python-bigquery-pandas/issues/365
"""
if not df.empty:
raise ValueError("DataFrame must be empty in order to cast non-nullsafe dtypes")
import db_dtypes
import pandas.api.types

dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
# If you update this mapping, also update the table at
# `docs/reading.rst`.
dtype_map = {
"DATE": db_dtypes.DateDtype(),
"DATETIME": "datetime64[ns]",
"TIMESTAMP": "datetime64[ns]",
}

for field in schema_fields:
column = str(field["name"])
# This method doesn't modify ARRAY/REPEATED columns.
if field["mode"].upper() == "REPEATED":
continue

name = str(field["name"])
dtype = dtype_map.get(field["type"].upper())
if dtype:
df[column] = df[column].astype(dtype)

# Avoid deprecated conversion to timezone-naive dtype by only casting
# object dtypes.
if dtype and pandas.api.types.is_object_dtype(df[name]):
df[name] = df[name].astype(dtype, errors="ignore")

# Ensure any TIMESTAMP columns are tz-aware.
df = pandas_gbq.timestamp.localize_df(df, schema_fields)

return df

Expand Down
5 changes: 0 additions & 5 deletions pandas_gbq/timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ def localize_df(df, schema_fields):
pandas.DataFrame
DataFrame with localized TIMESTAMP columns.
"""
if len(df.index) == 0:
# If there are no rows, there is nothing to do.
# Fix for https://github.com/pydata/pandas-gbq/issues/299
return df

for field in schema_fields:
column = str(field["name"])
if "mode" in field and field["mode"].upper() == "REPEATED":
Expand Down
17 changes: 12 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,19 @@
"pandas >=0.24.2",
"pyarrow >=3.0.0, <7.0dev",
"pydata-google-auth",
"google-api-core >=1.14.0",
"google-auth >=1.4.1",
# Note: google-api-core and google-auth are also included via transitive
# dependency on google-cloud-bigquery, but this library also uses them
# directly.
"google-api-core >=1.21.0",
"google-auth >=1.18.0",
"google-auth-oauthlib >=0.0.1",
# 2.4.* has a bug where waiting for the query can hang indefinitely.
# https://github.com/pydata/pandas-gbq/issues/343
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*",
# Require 1.27.* because it has a fix for out-of-bounds timestamps. See:
# https://github.com/googleapis/python-bigquery/pull/209 and
# https://github.com/googleapis/python-bigquery-pandas/issues/365
# Exclude 2.4.* because it has a bug where waiting for the query can hang
# indefinitely. https://github.com/pydata/pandas-gbq/issues/343
"google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*",
"google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
]
extras = {
"tqdm": "tqdm>=4.23.0",
Expand Down
6 changes: 3 additions & 3 deletions testing/constraints-3.7.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0
db-dtypes==0.3.1
google-api-core==1.14.0
google-auth==1.4.1
google-api-core==1.21.0
google-auth==1.18.0
google-auth-oauthlib==0.0.1
google-cloud-bigquery==1.11.1
google-cloud-bigquery==1.27.2
google-cloud-bigquery-storage==1.1.0
numpy==1.16.6
pandas==0.24.2
Expand Down
Loading

0 comments on commit d120f8f

Please sign in to comment.