From dcd78c7861edd922e1b6009f1e2c6e0fd1a5e995 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Mon, 16 Aug 2021 10:32:28 -0500
Subject: [PATCH] fix!: use nullable `Int64` and `boolean` dtypes in
 `to_dataframe` (#786)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`to_dataframe` now uses the nullable `Int64` and `boolean` pandas dtypes by
default. To override this behavior, specify the types for the desired columns
with the `dtypes` argument.

BREAKING CHANGE: uses Int64 type by default to avoid loss-of-precision in
results with large integer values

Fixes https://issuetracker.google.com/144712110 🦕
Fixes #793
---
 docs/conf.py                             |   1 +
 docs/usage/pandas.rst                    |  27 +++-
 google/cloud/bigquery/_pandas_helpers.py |  37 ++++-
 google/cloud/bigquery/table.py           |  11 +-
 setup.py                                 |   2 +-
 testing/constraints-3.6.txt              |   2 +-
 tests/system/test_arrow.py               |   5 +-
 tests/system/test_pandas.py              |  72 ++++++++-
 tests/unit/job/test_query_pandas.py      |  22 +--
 tests/unit/test_table.py                 |   8 +-
 tests/unit/test_table_pandas.py          | 192 +++++++++++++++++++++++
 11 files changed, 340 insertions(+), 39 deletions(-)
 create mode 100644 tests/unit/test_table_pandas.py

diff --git a/docs/conf.py b/docs/conf.py
index cb347160d..09f7ea414 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -110,6 +110,7 @@
 # directories to ignore when looking for source files.
 exclude_patterns = [
     "_build",
+    "**/.nox/**/*",
     "samples/AUTHORING_GUIDE.md",
     "samples/CONTRIBUTING.md",
     "samples/snippets/README.rst",
diff --git a/docs/usage/pandas.rst b/docs/usage/pandas.rst
index 9db98dfbb..40732a298 100644
--- a/docs/usage/pandas.rst
+++ b/docs/usage/pandas.rst
@@ -14,12 +14,12 @@ First, ensure that the :mod:`pandas` library is installed by running:

    pip install --upgrade pandas

-Alternatively, you can install the BigQuery python client library with
+Alternatively, you can install the BigQuery Python client library with
 :mod:`pandas` by running:

 .. code-block:: bash

-   pip install --upgrade google-cloud-bigquery[pandas]
+   pip install --upgrade 'google-cloud-bigquery[pandas]'

 To retrieve query results as a :class:`pandas.DataFrame`:

@@ -37,6 +37,27 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
    :start-after: [START bigquery_list_rows_dataframe]
    :end-before: [END bigquery_list_rows_dataframe]

+The following data types are used when creating a pandas DataFrame.
+
+.. list-table:: Pandas Data Type Mapping
+   :header-rows: 1
+
+   * - BigQuery
+     - pandas
+     - Notes
+   * - BOOL
+     - boolean
+     -
+   * - DATETIME
+     - datetime64[ns], object
+     - object is used when there are values that datetime64[ns] cannot represent
+   * - FLOAT64
+     - float64
+     -
+   * - INT64
+     - Int64
+     -
+
 Load a Pandas DataFrame to a BigQuery Table
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -45,7 +66,7 @@ As of version 1.3.0, you can use the
 to load data from a :class:`pandas.DataFrame` to a
 :class:`~google.cloud.bigquery.table.Table`. To use this function, in addition
 to :mod:`pandas`, you will need to install the :mod:`pyarrow` library. You can
-install the BigQuery python client library with :mod:`pandas` and
+install the BigQuery Python client library with :mod:`pandas` and
 :mod:`pyarrow` by running:

 .. code-block:: bash
diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index 817930ddd..88759bd18 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -18,6 +18,7 @@
 import functools
 import logging
 import queue
+from typing import Dict, Sequence
 import warnings

 try:
@@ -42,15 +43,19 @@ _LOGGER = logging.getLogger(__name__)

-_NO_BQSTORAGE_ERROR = (
-    "The google-cloud-bigquery-storage library is not installed, "
-    "please install google-cloud-bigquery-storage to use bqstorage features."
-)
-
 _PROGRESS_INTERVAL = 0.2  # Maximum time between download status checks, in seconds.

 _MAX_QUEUE_SIZE_DEFAULT = object()  # max queue size sentinel for BQ Storage downloads

+# If you update the default dtypes, also update the docs at docs/usage/pandas.rst.
+_BQ_TO_PANDAS_DTYPE_NULLSAFE = {
+    "BOOL": "boolean",
+    "BOOLEAN": "boolean",
+    "FLOAT": "float64",
+    "FLOAT64": "float64",
+    "INT64": "Int64",
+    "INTEGER": "Int64",
+}
 _PANDAS_DTYPE_TO_BQ = {
     "bool": "BOOLEAN",
     "datetime64[ns, UTC]": "TIMESTAMP",
@@ -217,6 +222,28 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)


+def bq_schema_to_nullsafe_pandas_dtypes(
+    bq_schema: Sequence[schema.SchemaField],
+) -> Dict[str, str]:
+    """Return the default dtypes to use for columns in a BigQuery schema.
+
+    Only returns dtypes that are safe for columns containing NULL values.
+    This includes Int64, which uses pandas.NA for missing values and does
+    not lose precision for large integer values.
+
+    Returns:
+        A mapping from column names to pandas dtypes.
+    """
+    dtypes = {}
+    for bq_field in bq_schema:
+        if bq_field.mode.upper() not in {"NULLABLE", "REQUIRED"}:
+            continue
+        field_type = bq_field.field_type.upper()
+        if field_type in _BQ_TO_PANDAS_DTYPE_NULLSAFE:
+            dtypes[bq_field.name] = _BQ_TO_PANDAS_DTYPE_NULLSAFE[field_type]
+    return dtypes
+
+
 def bq_to_arrow_array(series, bq_field):
     arrow_type = bq_to_arrow_data_type(bq_field)
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
index dad06deed..4054f37fe 100644
--- a/google/cloud/bigquery/table.py
+++ b/google/cloud/bigquery/table.py
@@ -1933,6 +1933,13 @@ def to_dataframe(
             bqstorage_client=bqstorage_client,
             create_bqstorage_client=create_bqstorage_client,
         )
+        default_dtypes = _pandas_helpers.bq_schema_to_nullsafe_pandas_dtypes(
+            self.schema
+        )
+
+        # Let the user-defined dtypes override the default ones.
+        # https://stackoverflow.com/a/26853961/101923
+        dtypes = {**default_dtypes, **dtypes}

         # When converting timestamp values to nanosecond precision, the result
         # can be out of pyarrow bounds. To avoid the error when converting to
@@ -1954,7 +1961,9 @@
         extra_kwargs = {"timestamp_as_object": timestamp_as_object}

-        df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)
+        df = record_batch.to_pandas(
+            date_as_object=date_as_object, integer_object_nulls=True, **extra_kwargs
+        )

         for column in dtypes:
             df[column] = pandas.Series(df[column], dtype=dtypes[column])
diff --git a/setup.py b/setup.py
index 5205b5365..6fa619d37 100644
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,7 @@
     # Keep the no-op bqstorage extra for backward compatibility.
    # See: https://github.com/googleapis/python-bigquery/issues/757
     "bqstorage": [],
-    "pandas": ["pandas>=0.23.0"],
+    "pandas": ["pandas>=1.0.0"],
     "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"],
     "opentelemetry": [
         "opentelemetry-api >= 0.11b0",
diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt
index ce012f0d7..bf1f89f58 100644
--- a/testing/constraints-3.6.txt
+++ b/testing/constraints-3.6.txt
@@ -13,7 +13,7 @@ grpcio==1.38.1
 opentelemetry-api==0.11b0
 opentelemetry-instrumentation==0.11b0
 opentelemetry-sdk==0.11b0
-pandas==0.23.0
+pandas==1.0.0
 proto-plus==1.10.0
 protobuf==3.12.0
 pyarrow==3.0.0
diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py
index f97488e39..7e20dfd7c 100644
--- a/tests/system/test_arrow.py
+++ b/tests/system/test_arrow.py
@@ -14,12 +14,9 @@

 """System tests for Arrow connector."""

+import pyarrow
 import pytest

-pyarrow = pytest.importorskip(
-    "pyarrow", minversion="3.0.0"
-)  # Needs decimal256 for BIGNUMERIC columns.
-

 @pytest.mark.parametrize(
     ("max_results", "scalars_table_name"),
diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py
index 6a96dff62..411c9bed0 100644
--- a/tests/system/test_pandas.py
+++ b/tests/system/test_pandas.py
@@ -567,7 +567,7 @@ def test_query_results_to_dataframe(bigquery_client):
     for _, row in df.iterrows():
         for col in column_names:
             # all the schema fields are nullable, so None is acceptable
-            if not row[col] is None:
+            if not pandas.isna(row[col]):
                 assert isinstance(row[col], exp_datatypes[col])


@@ -597,7 +597,7 @@ def test_query_results_to_dataframe_w_bqstorage(bigquery_client):
     for index, row in df.iterrows():
         for col in column_names:
             # all the schema fields are nullable, so None is acceptable
-            if not row[col] is None:
+            if not pandas.isna(row[col]):
                 assert isinstance(row[col], exp_datatypes[col])


@@ -795,3 +795,71 @@ def test_list_rows_max_results_w_bqstorage(bigquery_client):
     dataframe = row_iterator.to_dataframe(bqstorage_client=bqstorage_client)

     assert len(dataframe.index) == 100
+
+
+@pytest.mark.parametrize(
+    ("max_results",), ((None,), (10,),)  # (None,) -> BQ Storage API; (10,) -> REST API.
+)
+def test_list_rows_nullable_scalars_dtypes(bigquery_client, scalars_table, max_results):
+    df = bigquery_client.list_rows(
+        scalars_table, max_results=max_results,
+    ).to_dataframe()
+
+    assert df.dtypes["bool_col"].name == "boolean"
+    assert df.dtypes["datetime_col"].name == "datetime64[ns]"
+    assert df.dtypes["float64_col"].name == "float64"
+    assert df.dtypes["int64_col"].name == "Int64"
+    assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]"
+
+    # object is used by default, but we can use "datetime64[ns]" automatically
+    # when data is within the supported range.
+    # https://github.com/googleapis/python-bigquery/issues/861
+    assert df.dtypes["date_col"].name == "object"
+
+    # object is used by default, but we can use "timedelta64[ns]" automatically
+    # https://github.com/googleapis/python-bigquery/issues/862
+    assert df.dtypes["time_col"].name == "object"
+
+    # decimal.Decimal is used to avoid loss of precision.
+    assert df.dtypes["bignumeric_col"].name == "object"
+    assert df.dtypes["numeric_col"].name == "object"
+
+    # pandas uses Python string and bytes objects.
+    assert df.dtypes["bytes_col"].name == "object"
+    assert df.dtypes["string_col"].name == "object"
+
+
+@pytest.mark.parametrize(
+    ("max_results",), ((None,), (10,),)  # (None,) -> BQ Storage API; (10,) -> REST API.
+) +def test_list_rows_nullable_scalars_extreme_dtypes( + bigquery_client, scalars_extreme_table, max_results +): + df = bigquery_client.list_rows( + scalars_extreme_table, max_results=max_results + ).to_dataframe() + + # Extreme values are out-of-bounds for pandas datetime64 values, which use + # nanosecond precision. Values before 1677-09-21 and after 2262-04-11 must + # be represented with object. + # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations + assert df.dtypes["date_col"].name == "object" + assert df.dtypes["datetime_col"].name == "object" + assert df.dtypes["timestamp_col"].name == "object" + + # These pandas dtypes can handle the same ranges as BigQuery. + assert df.dtypes["bool_col"].name == "boolean" + assert df.dtypes["float64_col"].name == "float64" + assert df.dtypes["int64_col"].name == "Int64" + + # object is used by default, but we can use "timedelta64[ns]" automatically + # https://github.com/googleapis/python-bigquery/issues/862 + assert df.dtypes["time_col"].name == "object" + + # decimal.Decimal is used to avoid loss of precision. + assert df.dtypes["numeric_col"].name == "object" + assert df.dtypes["bignumeric_col"].name == "object" + + # pandas uses Python string and bytes objects. + assert df.dtypes["bytes_col"].name == "object" + assert df.dtypes["string_col"].name == "object" diff --git a/tests/unit/job/test_query_pandas.py b/tests/unit/job/test_query_pandas.py index e5105974f..c3a9d2d1a 100644 --- a/tests/unit/job/test_query_pandas.py +++ b/tests/unit/job/test_query_pandas.py @@ -20,11 +20,6 @@ import pyarrow import pytest -try: - import pandas -except (ImportError, AttributeError): # pragma: NO COVER - pandas = None - from google.cloud import bigquery_storage try: @@ -36,6 +31,8 @@ from .helpers import _make_connection from .helpers import _make_job_resource +pandas = pytest.importorskip("pandas") + @pytest.fixture def table_read_options_kwarg(): @@ -78,7 +75,6 @@ def test__contains_order_by(query, expected): assert not mut._contains_order_by(query) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.parametrize( "query", ( @@ -413,7 +409,6 @@ def test_to_arrow_w_tqdm_wo_query_plan(): result_patch_tqdm.assert_called() -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe(): from google.cloud.bigquery.job import QueryJob as target_class @@ -452,7 +447,6 @@ def test_to_dataframe(): assert list(df) == ["name", "age"] # verify the column names -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_ddl_query(): from google.cloud.bigquery.job import QueryJob as target_class @@ -472,7 +466,6 @@ def test_to_dataframe_ddl_query(): assert len(df) == 0 -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_bqstorage(table_read_options_kwarg): from google.cloud.bigquery.job import QueryJob as target_class @@ -522,7 +515,6 @@ def test_to_dataframe_bqstorage(table_read_options_kwarg): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_bqstorage_no_pyarrow_compression(): from google.cloud.bigquery.job import QueryJob as target_class @@ -565,7 +557,6 @@ def test_to_dataframe_bqstorage_no_pyarrow_compression(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_column_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -617,15 +608,14 @@ def test_to_dataframe_column_dtypes(): assert list(df) == exp_columns # 
verify the column names assert df.start_timestamp.dtype.name == "datetime64[ns, UTC]" - assert df.seconds.dtype.name == "int64" + assert df.seconds.dtype.name == "Int64" assert df.miles.dtype.name == "float64" assert df.km.dtype.name == "float16" assert df.payment_type.dtype.name == "object" - assert df.complete.dtype.name == "bool" + assert df.complete.dtype.name == "boolean" assert df.date.dtype.name == "object" -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_to_dataframe_column_date_dtypes(): from google.cloud.bigquery.job import QueryJob as target_class @@ -657,7 +647,6 @@ def test_to_dataframe_column_date_dtypes(): assert df.date.dtype.name == "datetime64[ns]" -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") @mock.patch("tqdm.tqdm") def test_to_dataframe_with_progress_bar(tqdm_mock): @@ -685,7 +674,6 @@ def test_to_dataframe_with_progress_bar(tqdm_mock): tqdm_mock.assert_called() -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm_pending(): from google.cloud.bigquery import table @@ -741,7 +729,6 @@ def test_to_dataframe_w_tqdm_pending(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm(): from google.cloud.bigquery import table @@ -801,7 +788,6 @@ def test_to_dataframe_w_tqdm(): ) -@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") @pytest.mark.skipif(tqdm is None, reason="Requires `tqdm`") def test_to_dataframe_w_tqdm_max_results(): from google.cloud.bigquery import table diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 66543bb38..44d02f14c 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2761,7 +2761,7 @@ def test_to_dataframe(self): self.assertEqual(len(df), 4) # verify the number of rows self.assertEqual(list(df), ["name", "age"]) # verify the column names self.assertEqual(df.name.dtype.name, "object") - self.assertEqual(df.age.dtype.name, "int64") + self.assertEqual(df.age.dtype.name, "Int64") @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self): @@ -3004,7 +3004,7 @@ def test_to_dataframe_w_various_types_nullable(self): self.assertTrue(row.isnull().all()) else: self.assertIsInstance(row.start_timestamp, pandas.Timestamp) - self.assertIsInstance(row.seconds, float) + self.assertIsInstance(row.seconds, int) self.assertIsInstance(row.payment_type, str) self.assertIsInstance(row.complete, bool) self.assertIsInstance(row.date, datetime.date) @@ -3050,11 +3050,11 @@ def test_to_dataframe_column_dtypes(self): self.assertEqual(list(df), exp_columns) # verify the column names self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]") - self.assertEqual(df.seconds.dtype.name, "int64") + self.assertEqual(df.seconds.dtype.name, "Int64") self.assertEqual(df.miles.dtype.name, "float64") self.assertEqual(df.km.dtype.name, "float16") self.assertEqual(df.payment_type.dtype.name, "object") - self.assertEqual(df.complete.dtype.name, "bool") + self.assertEqual(df.complete.dtype.name, "boolean") self.assertEqual(df.date.dtype.name, "object") @mock.patch("google.cloud.bigquery.table.pandas", new=None) diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py new file mode 100644 index 000000000..a223e6652 --- /dev/null +++ 
b/tests/unit/test_table_pandas.py @@ -0,0 +1,192 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import decimal +from unittest import mock + +import pyarrow +import pytest + +from google.cloud import bigquery + +pandas = pytest.importorskip("pandas") + + +TEST_PATH = "/v1/project/test-proj/dataset/test-dset/table/test-tbl/data" + + +@pytest.fixture +def class_under_test(): + from google.cloud.bigquery.table import RowIterator + + return RowIterator + + +def test_to_dataframe_nullable_scalars(monkeypatch, class_under_test): + # See tests/system/test_arrow.py for the actual types we get from the API. + arrow_schema = pyarrow.schema( + [ + pyarrow.field("bignumeric_col", pyarrow.decimal256(76, scale=38)), + pyarrow.field("bool_col", pyarrow.bool_()), + pyarrow.field("bytes_col", pyarrow.binary()), + pyarrow.field("date_col", pyarrow.date32()), + pyarrow.field("datetime_col", pyarrow.timestamp("us", tz=None)), + pyarrow.field("float64_col", pyarrow.float64()), + pyarrow.field("int64_col", pyarrow.int64()), + pyarrow.field("numeric_col", pyarrow.decimal128(38, scale=9)), + pyarrow.field("string_col", pyarrow.string()), + pyarrow.field("time_col", pyarrow.time64("us")), + pyarrow.field( + "timestamp_col", pyarrow.timestamp("us", tz=datetime.timezone.utc) + ), + ] + ) + arrow_table = pyarrow.Table.from_pydict( + { + "bignumeric_col": [decimal.Decimal("123.456789101112131415")], + "bool_col": [True], + "bytes_col": [b"Hello,\x00World!"], + "date_col": [datetime.date(2021, 8, 9)], + "datetime_col": [datetime.datetime(2021, 8, 9, 13, 30, 44, 123456)], + "float64_col": [1.25], + "int64_col": [-7], + "numeric_col": [decimal.Decimal("-123.456789")], + "string_col": ["abcdefg"], + "time_col": [datetime.time(14, 21, 17, 123456)], + "timestamp_col": [ + datetime.datetime( + 2021, 8, 9, 13, 30, 44, 123456, tzinfo=datetime.timezone.utc + ) + ], + }, + schema=arrow_schema, + ) + + nullable_schema = [ + bigquery.SchemaField("bignumeric_col", "BIGNUMERIC"), + bigquery.SchemaField("bool_col", "BOOLEAN"), + bigquery.SchemaField("bytes_col", "BYTES"), + bigquery.SchemaField("date_col", "DATE"), + bigquery.SchemaField("datetime_col", "DATETIME"), + bigquery.SchemaField("float64_col", "FLOAT"), + bigquery.SchemaField("int64_col", "INT64"), + bigquery.SchemaField("numeric_col", "NUMERIC"), + bigquery.SchemaField("string_col", "STRING"), + bigquery.SchemaField("time_col", "TIME"), + bigquery.SchemaField("timestamp_col", "TIMESTAMP"), + ] + mock_client = mock.create_autospec(bigquery.Client) + mock_client.project = "test-proj" + mock_api_request = mock.Mock() + mock_to_arrow = mock.Mock() + mock_to_arrow.return_value = arrow_table + rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema) + monkeypatch.setattr(rows, "to_arrow", mock_to_arrow) + df = rows.to_dataframe() + + # Check for expected dtypes. 
+    # Keep these in sync with tests/system/test_pandas.py
+    assert df.dtypes["bignumeric_col"].name == "object"
+    assert df.dtypes["bool_col"].name == "boolean"
+    assert df.dtypes["bytes_col"].name == "object"
+    assert df.dtypes["date_col"].name == "object"
+    assert df.dtypes["datetime_col"].name == "datetime64[ns]"
+    assert df.dtypes["float64_col"].name == "float64"
+    assert df.dtypes["int64_col"].name == "Int64"
+    assert df.dtypes["numeric_col"].name == "object"
+    assert df.dtypes["string_col"].name == "object"
+    assert df.dtypes["time_col"].name == "object"
+    assert df.dtypes["timestamp_col"].name == "datetime64[ns, UTC]"
+
+    # Check for expected values.
+    assert df["bignumeric_col"][0] == decimal.Decimal("123.456789101112131415")
+    assert df["bool_col"][0]  # True
+    assert df["bytes_col"][0] == b"Hello,\x00World!"
+
+    # object is used by default, but we can use "datetime64[ns]" automatically
+    # when data is within the supported range.
+    # https://github.com/googleapis/python-bigquery/issues/861
+    assert df["date_col"][0] == datetime.date(2021, 8, 9)
+
+    assert df["datetime_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456")
+    assert df["float64_col"][0] == 1.25
+    assert df["int64_col"][0] == -7
+    assert df["numeric_col"][0] == decimal.Decimal("-123.456789")
+    assert df["string_col"][0] == "abcdefg"
+
+    # Pandas timedelta64 might be a better choice for pandas time columns. Then
+    # they can more easily be combined with date columns to form datetimes.
+    # https://github.com/googleapis/python-bigquery/issues/862
+    assert df["time_col"][0] == datetime.time(14, 21, 17, 123456)
+
+    assert df["timestamp_col"][0] == pandas.to_datetime("2021-08-09 13:30:44.123456Z")
+
+
+def test_to_dataframe_nullable_scalars_with_custom_dtypes(
+    monkeypatch, class_under_test
+):
+    """Explicit dtypes passed by the user are merged with the null-safe defaults."""
+    arrow_schema = pyarrow.schema(
+        [
+            pyarrow.field("int64_col", pyarrow.int64()),
+            pyarrow.field("other_int_col", pyarrow.int64()),
+        ]
+    )
+    arrow_table = pyarrow.Table.from_pydict(
+        {"int64_col": [1000], "other_int_col": [-7]}, schema=arrow_schema,
+    )
+
+    nullable_schema = [
+        bigquery.SchemaField("int64_col", "INT64"),
+        bigquery.SchemaField("other_int_col", "INT64"),
+    ]
+    mock_client = mock.create_autospec(bigquery.Client)
+    mock_client.project = "test-proj"
+    mock_api_request = mock.Mock()
+    mock_to_arrow = mock.Mock()
+    mock_to_arrow.return_value = arrow_table
+    rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema)
+    monkeypatch.setattr(rows, "to_arrow", mock_to_arrow)
+    df = rows.to_dataframe(dtypes={"other_int_col": "int8"})
+
+    assert df.dtypes["int64_col"].name == "Int64"
+    assert df["int64_col"][0] == 1000
+
+    assert df.dtypes["other_int_col"].name == "int8"
+    assert df["other_int_col"][0] == -7
+
+
+def test_to_dataframe_arrays(monkeypatch, class_under_test):
+    arrow_schema = pyarrow.schema(
+        [pyarrow.field("int64_repeated", pyarrow.list_(pyarrow.int64()))]
+    )
+    arrow_table = pyarrow.Table.from_pydict(
+        {"int64_repeated": [[-1, 0, 2]]}, schema=arrow_schema,
+    )
+
+    nullable_schema = [
+        bigquery.SchemaField("int64_repeated", "INT64", mode="REPEATED"),
+    ]
+    mock_client = mock.create_autospec(bigquery.Client)
+    mock_client.project = "test-proj"
+    mock_api_request = mock.Mock()
+    mock_to_arrow = mock.Mock()
+    mock_to_arrow.return_value = arrow_table
+    rows = class_under_test(mock_client, mock_api_request, TEST_PATH, nullable_schema)
+    monkeypatch.setattr(rows, "to_arrow", mock_to_arrow)
+    df = rows.to_dataframe()
+ + assert df.dtypes["int64_repeated"].name == "object" + assert tuple(df["int64_repeated"][0]) == (-1, 0, 2)
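
A minimal sketch of the behavior this patch introduces, assuming a
hypothetical table `my-project.my_dataset.my_table` with INT64 and BOOL
columns (the table and column names below are illustrative placeholders, not
part of this patch):

    from google.cloud import bigquery

    client = bigquery.Client()

    # With this change, INT64 columns arrive as the nullable "Int64" dtype and
    # BOOL columns as "boolean", so NULLs become pandas.NA instead of forcing
    # a lossy cast to float64 or object.
    df = client.list_rows("my-project.my_dataset.my_table").to_dataframe()
    print(df.dtypes)  # e.g. int64_col -> Int64, bool_col -> boolean

    # User-supplied dtypes are merged over the null-safe defaults, so the
    # previous behavior can be restored per column via the `dtypes` argument.
    df = client.list_rows("my-project.my_dataset.my_table").to_dataframe(
        dtypes={"int64_col": "float64"}
    )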