fix!: use nullable Int64 and boolean dtypes in to_dataframe #786
```diff
@@ -18,6 +18,7 @@
 import functools
 import logging
 import queue
+from typing import Sequence
 import warnings

 try:
```
```diff
@@ -42,15 +43,19 @@
 _LOGGER = logging.getLogger(__name__)

 _NO_BQSTORAGE_ERROR = (
     "The google-cloud-bigquery-storage library is not installed, "
     "please install google-cloud-bigquery-storage to use bqstorage features."
 )

 _PROGRESS_INTERVAL = 0.2  # Maximum time between download status checks, in seconds.

 _MAX_QUEUE_SIZE_DEFAULT = object()  # max queue size sentinel for BQ Storage downloads

+# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
+_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
+    "BOOL": "boolean",
+    "BOOLEAN": "boolean",
+    "FLOAT": "float64",
+    "FLOAT64": "float64",
+    "INT64": "Int64",
+    "INTEGER": "Int64",
+}
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
```
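For context on the mapping above: the capitalized `"Int64"` and `"boolean"` names are pandas extension dtypes that can hold `pandas.NA`, unlike the NumPy-backed `"int64"` and `"bool"`. A quick illustration with plain pandas, independent of this PR:

```python
import pandas

# NumPy-backed int64 cannot represent missing values: mixing in None
# silently upcasts the column to float64 (losing exact integer precision
# for values above 2**53).
print(pandas.Series([1, None]).dtype)  # float64

# The nullable extension dtypes keep integers exact and mark gaps as <NA>.
print(pandas.Series([1, None], dtype="Int64"))       # 1, <NA>  (dtype: Int64)
print(pandas.Series([True, None], dtype="boolean"))  # True, <NA>  (dtype: boolean)
```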
```diff
@@ -217,6 +222,26 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)


+def bq_schema_to_nullsafe_pandas_dtypes(bq_schema: Sequence[schema.SchemaField]):
+    """Return the default dtypes to use for columns in a BigQuery schema.
+
+    Only returns default dtypes which are safe to have NULL values. This
+    includes Int64, which has pandas.NA values and does not result in
+    loss-of-precision.
+
+    Returns:
+        Dict[str, str]: mapping from column names to dtypes
+    """
+    dtypes = {}
+    for bq_field in bq_schema:
+        if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
+            continue
+        field_type = bq_field.field_type.upper()
+        if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
+            dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
+    return dtypes
+
+
 def bq_to_arrow_array(series, bq_field):
     arrow_type = bq_to_arrow_data_type(bq_field)
```

Review comment on the docstring's `Returns:` section: (nit) Can be expressed as the annotation of the function return type.
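A runnable sketch of the new helper's behavior, with a hypothetical `FakeField` class standing in for `google.cloud.bigquery.schema.SchemaField` so it works without the client library installed:

```python
from dataclasses import dataclass


@dataclass
class FakeField:
    """Hypothetical stand-in for google.cloud.bigquery.schema.SchemaField."""

    name: str
    field_type: str
    mode: str = "NULLABLE"


_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
    "BOOL": "boolean", "BOOLEAN": "boolean",
    "FLOAT": "float64", "FLOAT64": "float64",
    "INT64": "Int64", "INTEGER": "Int64",
}


def bq_schema_to_nullsafe_pandas_dtypes(bq_schema):
    dtypes = {}
    for bq_field in bq_schema:
        # REPEATED (array) columns have no null-safe scalar dtype; skip them.
        if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
            continue
        field_type = bq_field.field_type.upper()
        if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
            dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
    return dtypes


fields = [
    FakeField("int64_col", "INT64"),
    FakeField("bool_col", "BOOL"),
    FakeField("string_col", "STRING"),            # no null-safe entry -> omitted
    FakeField("array_col", "INT64", "REPEATED"),  # repeated -> skipped
]
print(bq_schema_to_nullsafe_pandas_dtypes(fields))
# {'int64_col': 'Int64', 'bool_col': 'boolean'}
```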
```diff
@@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client):
     for _, row in df.iterrows():
         for col in column_names:
             # all the schema fields are nullable, so None is acceptable
-            if not row[col] is None:
+            if not pandas.isna(row[col]):
                 assert isinstance(row[col], exp_datatypes[col])
```

```diff
@@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client):
    for index, row in df.iterrows():
        for col in column_names:
            # all the schema fields are nullable, so None is acceptable
-           if not row[col] is None:
+           if not pandas.isna(row[col]):
                assert isinstance(row[col], exp_datatypes[col])
```
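The `is None` to `pandas.isna` change matters because missing values in the new nullable columns arrive as `pandas.NA` (and as `NaN` in float columns), neither of which is `None`. A minimal check with plain pandas:

```python
import pandas

s = pandas.Series([1, None], dtype="Int64")
missing = s[1]                    # pandas.NA, not None
print(missing is None)            # False -- the old check would treat NA as a value
print(pandas.isna(missing))       # True
print(pandas.isna(None))          # True -- isna still covers plain None
print(pandas.isna(float("nan")))  # True -- and float NaN
```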
```diff
@@ -795,3 +795,71 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client):
     dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client)

     assert len(dataframe.index) == 100
+
+
+@pytest.mark.parametrize(
+    ("max_results",), ((None,), (10,),)  # Use BQ Storage API. # Use REST API.
+)
+def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results):
+    df = bigquery_client.list_rows(
+        scalars_table, max_results=max_results,
+    ).to_dataframe()
+
+    assert df.dtypes["bool_col"].name == "boolean"
+    assert df.dtypes["datetime_col"].name == "datetime64[ns]"
+    assert df.dtypes["float64_col"].name == "float64"
+    assert df.dtypes["int64_col"].name == "Int64"
+    assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]"
+
+    # object is used by default, but we can use "datetime64[ns]" automatically
+    # when data is within the supported range.
+    # https://github.com/googleapis/python-bigquery/issues/861
+    assert df.dtypes["date_col"].name == "object"
+
+    # object is used by default, but we can use "timedelta64[ns]" automatically
+    # https://github.com/googleapis/python-bigquery/issues/862
+    assert df.dtypes["time_col"].name == "object"
+
+    # decimal.Decimal is used to avoid loss of precision.
+    assert df.dtypes["bignumeric_col"].name == "object"
+    assert df.dtypes["numeric_col"].name == "object"
+
+    # pandas uses Python string and bytes objects.
+    assert df.dtypes["bytes_col"].name == "object"
+    assert df.dtypes["string_col"].name == "object"
+
+
+@pytest.mark.parametrize(
+    ("max_results",), ((None,), (10,),)  # Use BQ Storage API. # Use REST API.
+)
+def test_list_rows_nullable_scalars_extreme_dtypes(
+    bigquery_client, scalars_extreme_table, max_results
+):
+    df = bigquery_client.list_rows(
+        scalars_extreme_table, max_results=max_results
+    ).to_dataframe()
+
+    # Extreme values are out-of-bounds for pandas datetime64 values, which use
+    # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must
+    # be represented with object.
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations
+    assert df.dtypes["date_col"].name == "object"
+    assert df.dtypes["datetime_col"].name == "object"
+    assert df.dtypes["timestamp_col"].name == "object"
+
+    # These pandas dtypes can handle the same ranges as BigQuery.
+    assert df.dtypes["bool_col"].name == "boolean"
+    assert df.dtypes["float64_col"].name == "float64"
+    assert df.dtypes["int64_col"].name == "Int64"
+
+    # object is used by default, but we can use "timedelta64[ns]" automatically
+    # https://github.com/googleapis/python-bigquery/issues/862
+    assert df.dtypes["time_col"].name == "object"
+
+    # decimal.Decimal is used to avoid loss of precision.
+    assert df.dtypes["numeric_col"].name == "object"
+    assert df.dtypes["bignumeric_col"].name == "object"
+
+    # pandas uses Python string and bytes objects.
+    assert df.dtypes["bytes_col"].name == "object"
+    assert df.dtypes["string_col"].name == "object"
```

Review comment on the `list_rows` call in `test_list_rows_nullable_scalars_dtypes`: Note to self: I'll need to exclude the INTERVAL column next time we sync with master.
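The 1677-09-21 / 2262-04-11 bounds cited in the comments above are the limits of pandas' nanosecond-precision `Timestamp`; they can be confirmed with plain pandas:

```python
import pandas

# datetime64[ns] spans roughly 584 years centered on the Unix epoch, so
# BigQuery's wider DATE/DATETIME/TIMESTAMP ranges cannot always fit and
# out-of-range values fall back to object dtype.
print(pandas.Timestamp.min)  # 1677-09-21 00:12:43.145224193
print(pandas.Timestamp.max)  # 2262-04-11 23:47:16.854775807
```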
Review comment: (nit) Since we're already at this, there's at least one other occurrence of "python" not capitalized (line 69), which can also be fixed.