-
Notifications
You must be signed in to change notification settings - Fork 125
fix: read_gbq
supports extreme DATETIME values such as 0001-01-01 00:00:00
#444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
933d470
9a9d3fd
2a76982
4695c5f
6adf233
73a791a
9b1eb0d
ec9ddaf
9cc7c74
dd51ad8
e1ad679
d29bc2a
cb8f24f
56b73b2
8a61e97
3f7900b
ae3e044
3c53f1f
c98982d
f0acde6
362a26d
752d67c
5b46127
254f6a0
c0780b6
9aaedc6
b03443b
cd6ae70
11126a6
14e6070
8f92d9b
9985d15
6fb73a2
8cc4524
a0d6cad
dfa6942
82c5362
de4a06e
9fc8c08
c5c0e85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,13 +2,13 @@ | |
# Use of this source code is governed by a BSD-style | ||
# license that can be found in the LICENSE file. | ||
|
||
from datetime import datetime | ||
import logging | ||
import re | ||
import time | ||
import warnings | ||
from datetime import datetime | ||
import typing | ||
from typing import Any, Dict, Optional, Union | ||
from typing import Any, Dict, Optional, Sequence, Union | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
|
@@ -37,21 +37,25 @@ | |
import pandas_gbq.schema | ||
import pandas_gbq.timestamp | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
try: | ||
import tqdm # noqa | ||
except ImportError: | ||
tqdm = None | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def _test_google_api_imports(): | ||
try: | ||
import pkg_resources # noqa | ||
except ImportError as ex: | ||
raise ImportError("pandas-gbq requires setuptools") from ex | ||
|
||
try: | ||
import db_dtypes # noqa | ||
except ImportError as ex: | ||
raise ImportError("pandas-gbq requires db-dtypes") from ex | ||
|
||
try: | ||
import pydata_google_auth # noqa | ||
except ImportError as ex: | ||
|
@@ -546,6 +550,8 @@ def _download_results( | |
to_dataframe_kwargs = {} | ||
if FEATURES.bigquery_has_bqstorage: | ||
to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client | ||
if FEATURES.bigquery_needs_date_as_object: | ||
to_dataframe_kwargs["date_as_object"] = True | ||
|
||
try: | ||
schema_fields = [field.to_api_repr() for field in rows_iter.schema] | ||
|
@@ -559,11 +565,7 @@ def _download_results( | |
except self.http_error as ex: | ||
self.process_http_error(ex) | ||
|
||
if df.empty: | ||
df = _cast_empty_df_dtypes(schema_fields, df) | ||
|
||
# Ensure any TIMESTAMP columns are tz-aware. | ||
df = pandas_gbq.timestamp.localize_df(df, schema_fields) | ||
df = _finalize_dtypes(df, schema_fields) | ||
|
||
logger.debug("Got {} rows.\n".format(rows_iter.total_rows)) | ||
return df | ||
|
@@ -617,23 +619,18 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): | |
See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html | ||
#missing-data-casting-rules-and-indexing | ||
""" | ||
import db_dtypes | ||
|
||
# If you update this mapping, also update the table at | ||
# `docs/reading.rst`. | ||
dtype_map = { | ||
"DATE": "datetime64[ns]", | ||
"DATETIME": "datetime64[ns]", | ||
"FLOAT": np.dtype(float), | ||
"GEOMETRY": "object", | ||
"INTEGER": "Int64", | ||
"RECORD": "object", | ||
"STRING": "object", | ||
# datetime.time objects cannot be cast to datetime64. | ||
# https://github.com/pydata/pandas-gbq/issues/328 | ||
"TIME": "object", | ||
# pandas doesn't support timezone-aware dtype in DataFrame/Series | ||
# constructors. It's more idiomatic to localize after construction. | ||
# https://github.com/pandas-dev/pandas/issues/25843 | ||
"TIMESTAMP": "datetime64[ns]", | ||
"TIME": db_dtypes.TimeDtype(), | ||
# Note: Other types such as 'datetime64[ns]' and db_dtypes.DateDtype() | ||
# are not included because the pandas range does not align with the | ||
# BigQuery range. We need to attempt a conversion to those types and | ||
# fall back to 'object' when there are out-of-range values. | ||
} | ||
|
||
# Amend dtype_map with newer extension types if pandas version allows. | ||
|
@@ -656,28 +653,43 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): | |
return dtypes | ||
|
||
|
||
def _cast_empty_df_dtypes(schema_fields, df): | ||
"""Cast any columns in an empty dataframe to correct type. | ||
def _finalize_dtypes( | ||
df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]] | ||
) -> "pandas.DataFrame": | ||
""" | ||
Attempt to change the dtypes of those columns that don't map exactly. | ||
|
||
In an empty dataframe, pandas cannot choose a dtype unless one is | ||
explicitly provided. The _bqschema_to_nullsafe_dtypes() function only | ||
provides dtypes when the dtype safely handles null values. This means | ||
that empty int64 and boolean columns are incorrectly classified as | ||
``object``. | ||
For example db_dtypes.DateDtype() and datetime64[ns] cannot represent | ||
0001-01-01, but they can represent dates within a couple hundred years of | ||
1970. See: | ||
https://github.com/googleapis/python-bigquery-pandas/issues/365 | ||
""" | ||
if not df.empty: | ||
raise ValueError("DataFrame must be empty in order to cast non-nullsafe dtypes") | ||
import db_dtypes | ||
import pandas.api.types | ||
|
||
dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64} | ||
# If you update this mapping, also update the table at | ||
# `docs/reading.rst`. | ||
dtype_map = { | ||
"DATE": db_dtypes.DateDtype(), | ||
"DATETIME": "datetime64[ns]", | ||
"TIMESTAMP": "datetime64[ns]", | ||
} | ||
|
||
for field in schema_fields: | ||
column = str(field["name"]) | ||
# This method doesn't modify ARRAY/REPEATED columns. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this imply a TODO for later, or is the nature of pandas such that arrays are just always an object that gets no special processing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential TODO, but such a low priority I don't think it's worth calling out. Now that we have https://github.com/googleapis/python-db-dtypes-pandas we have more flexibility in terms of creating dtypes that are more efficient than Python object columns. Though in this case, I'm not sure we'd have any better of an approach than https://github.com/xhochy/fletcher |
||
if field["mode"].upper() == "REPEATED": | ||
continue | ||
|
||
name = str(field["name"]) | ||
dtype = dtype_map.get(field["type"].upper()) | ||
if dtype: | ||
df[column] = df[column].astype(dtype) | ||
|
||
# Avoid deprecated conversion to timezone-naive dtype by only casting | ||
# object dtypes. | ||
if dtype and pandas.api.types.is_object_dtype(df[name]): | ||
df[name] = df[name].astype(dtype, errors="ignore") | ||
|
||
# Ensure any TIMESTAMP columns are tz-aware. | ||
df = pandas_gbq.timestamp.localize_df(df, schema_fields) | ||
|
||
return df | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,12 +28,19 @@ | |
"pandas >=0.24.2", | ||
"pyarrow >=3.0.0, <7.0dev", | ||
"pydata-google-auth", | ||
"google-api-core >=1.14.0", | ||
"google-auth >=1.4.1", | ||
# Note: google-api-core and google-auth are also included via transitive | ||
# dependency on google-cloud-bigquery, but this library also uses them | ||
# directly. | ||
"google-api-core >=1.21.0", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems unrelated, and unmotivated by anything in the changelog for that release. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct. Needed to update due to updating the minimum We do use |
||
"google-auth >=1.18.0", | ||
"google-auth-oauthlib >=0.0.1", | ||
# 2.4.* has a bug where waiting for the query can hang indefinitely. | ||
# https://github.com/pydata/pandas-gbq/issues/343 | ||
"google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*", | ||
# Require 1.27.* because it has a fix for out-of-bounds timestamps. See: | ||
# https://github.com/googleapis/python-bigquery/pull/209 and | ||
# https://github.com/googleapis/python-bigquery-pandas/issues/365 | ||
# Exclude 2.4.* because it has a bug where waiting for the query can hang | ||
# indefinitely. https://github.com/pydata/pandas-gbq/issues/343 | ||
"google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*", | ||
"google-cloud-bigquery-storage >=1.1.0,<3.0.0dev", | ||
] | ||
extras = { | ||
"tqdm": "tqdm>=4.23.0", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,10 +6,10 @@ | |
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", | ||
# Then this file should have foo==1.14.0 | ||
db-dtypes==0.3.1 | ||
google-api-core==1.14.0 | ||
google-auth==1.4.1 | ||
google-api-core==1.21.0 | ||
google-auth==1.18.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't match the minimum constraint in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated |
||
google-auth-oauthlib==0.0.1 | ||
google-cloud-bigquery==1.11.1 | ||
google-cloud-bigquery==1.27.2 | ||
google-cloud-bigquery-storage==1.1.0 | ||
numpy==1.16.6 | ||
pandas==0.24.2 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure how the changes here to non-datetime-related mappings relates to this PR.
If these changes are intentional, then the comment above seems to require a corresponding update to
docs/reading.rst
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
object
types were removed because it's the default anyway.