diff --git a/.circleci/config.yml b/.circleci/config.yml index 4a88a182..0af15778 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -45,12 +45,12 @@ jobs: - run: nox -s lint # Conda - "conda-3.6-0.20.1": + "conda-3.6-0.24.0": docker: - image: continuumio/miniconda3 environment: PYTHON: "3.6" - PANDAS: "0.20.1" + PANDAS: "0.24.0" steps: - checkout - run: ci/config_auth.sh @@ -65,4 +65,4 @@ workflows: - "pip-3.6" - "pip-3.7" - lint - - "conda-3.6-0.20.1" \ No newline at end of file + - "conda-3.6-0.24.0" \ No newline at end of file diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 46c79306..912e7a35 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -1,5 +1,5 @@ mock -pandas==0.19.0 +pandas==0.24.0 google-auth==1.4.1 google-auth-oauthlib==0.0.1 google-cloud-bigquery==1.9.0 diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 41a41891..f8c3cef2 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,4 +1,4 @@ -pandas==0.19.0 +pandas==0.24.0 google-auth==1.4.1 google-auth-oauthlib==0.0.1 google-cloud-bigquery==1.9.0 diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.24.0.conda similarity index 100% rename from ci/requirements-3.6-0.20.1.conda rename to ci/requirements-3.6-0.24.0.conda diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 9e0fbd9d..4977e3f4 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -6,14 +6,12 @@ Changelog 0.10.0 / TBD ------------ -- This fixes a bug where pandas-gbq could not upload an empty database. (:issue:`237`) - Dependency updates ~~~~~~~~~~~~~~~~~~ - Update the minimum version of ``google-cloud-bigquery`` to 1.9.0. (:issue:`247`) -- Update the minimum version of ``pandas`` to 0.19.0. (:issue:`262`) +- Update the minimum version of ``pandas`` to 0.24.0. 
(:issue:`263`) Internal changes ~~~~~~~~~~~~~~~~ @@ -23,11 +21,21 @@ Internal changes Enhancements ~~~~~~~~~~~~ + - Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns, with the rest being populated using the DataFrame dtypes (:issue:`218`) (contributed by @johnpaton) - Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if available (contributed by @daureg) +- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns', + tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`263`) + +Bug fixes +~~~~~~~~~ + +- Fix a bug where pandas-gbq could not upload an empty database. + (:issue:`237`) + .. _changelog-0.9.0: diff --git a/docs/source/reading.rst b/docs/source/reading.rst index add61ed2..838b36e0 100644 --- a/docs/source/reading.rst +++ b/docs/source/reading.rst @@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table .. code-block:: python - # Insert your BigQuery Project ID Here - # Can be found in the Google web console + import pandas_gbq + + # TODO: Set your BigQuery Project ID. projectid = "xxxxxxxx" - data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid) + data_frame = pandas_gbq.read_gbq( + 'SELECT * FROM `test_dataset.test_table`', + project_id=projectid) + +.. note:: + A project ID is sometimes optional if it can be inferred during + authentication, but it is required when authenticating with user + credentials. You can find your project ID in the `Google Cloud console + `__. You can define which column from BigQuery to use as an index in the destination DataFrame as well as a preferred column order as follows: .. 
code-block:: python - data_frame = read_gbq('SELECT * FROM test_dataset.test_table', - index_col='index_column_name', - col_order=['col1', 'col2', 'col3'], projectid) + data_frame = pandas_gbq.read_gbq( + 'SELECT * FROM `test_dataset.test_table`', + project_id=projectid, + index_col='index_column_name', + col_order=['col1', 'col2', 'col3']) You can specify the query config as parameter to use additional options of @@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here "useQueryCache": False } } - data_frame = read_gbq('SELECT * FROM test_dataset.test_table', - configuration=configuration, projectid) + data_frame = read_gbq( + 'SELECT * FROM `test_dataset.test_table`', + project_id=projectid, + configuration=configuration) -.. note:: +The ``dialect`` argument can be used to indicate whether to use +BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The +default value is ``'standard'``. For more information on BigQuery's standard +SQL, see `BigQuery SQL Reference +`__ - You can find your project id in the `Google developers console - `__. +.. code-block:: python + data_frame = pandas_gbq.read_gbq( + 'SELECT * FROM [test_dataset.test_table]', + project_id=projectid, + dialect='legacy') -.. note:: - The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL - or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change - in a subsequent release to ``'standard'``. For more information - on BigQuery's standard SQL, see `BigQuery SQL Reference - `__ +.. _reading-dtypes: + +Inferring the DataFrame's dtypes +-------------------------------- + +The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema. 
+ +================== ========================= +BigQuery Data Type dtype +================== ========================= +FLOAT float +TIMESTAMP DatetimeTZDtype(unit='ns', tz='UTC') +DATETIME datetime64[ns] +TIME datetime64[ns] +DATE datetime64[ns] +================== ========================= diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 17d18263..d557cfdb 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -644,21 +644,24 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema): def _bqschema_to_nullsafe_dtypes(schema_fields): - # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's - # default dtype choice. - # - # See: - # http://pandas.pydata.org/pandas-docs/dev/missing_data.html - # #missing-data-casting-rules-and-indexing + """Specify explicit dtypes based on BigQuery schema. + + This function only specifies a dtype when the dtype allows nulls. + Otherwise, use pandas's default dtype choice. + + See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html + #missing-data-casting-rules-and-indexing + """ + import pandas.api.types + + # If you update this mapping, also update the table at + # `docs/source/reading.rst`. dtype_map = { "FLOAT": np.dtype(float), - # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't - # support datetime64[ns, UTC] as dtype in DataFrame constructors. 
See: - # https://github.com/pandas-dev/pandas/issues/12513 - "TIMESTAMP": "datetime64[ns]", + "TIMESTAMP": pandas.api.types.DatetimeTZDtype(tz="UTC"), + "DATETIME": "datetime64[ns]", "TIME": "datetime64[ns]", "DATE": "datetime64[ns]", - "DATETIME": "datetime64[ns]", } dtypes = {} diff --git a/setup.py b/setup.py index 8e36e54a..7ef6330e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def readme(): INSTALL_REQUIRES = [ "setuptools", - "pandas>=0.19.0", + "pandas>=0.24.0", "pydata-google-auth", "google-auth", "google-auth-oauthlib", diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 4480f203..e04aeda5 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -314,7 +314,7 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id): df, DataFrame( {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, - dtype="datetime64[ns]", + dtype=pandas.api.types.DatetimeTZDtype(tz="UTC"), ), ) @@ -330,7 +330,7 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id): df, DataFrame( {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]}, - dtype="datetime64[ns]", + dtype=pandas.api.types.DatetimeTZDtype(tz="UTC"), ), ) @@ -368,7 +368,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id): "expression, is_expected_dtype", [ ("current_date()", pandas.api.types.is_datetime64_ns_dtype), - ("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype), + ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype), ("TRUE", pandas.api.types.is_bool_dtype), ("FALSE", pandas.api.types.is_bool_dtype), @@ -403,7 +403,11 @@ def test_should_properly_handle_null_timestamp(self, project_id): dialect="legacy", ) tm.assert_frame_equal( - df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]") + df, + DataFrame( + {"null_timestamp": [NaT]}, + dtype=pandas.api.types.DatetimeTZDtype(tz="UTC"), + ), ) def 
test_should_properly_handle_null_datetime(self, project_id): @@ -589,7 +593,9 @@ def test_zero_rows(self, project_id): "title": pandas.Series([], dtype=object), "id": pandas.Series([], dtype=np.dtype(int)), "is_bot": pandas.Series([], dtype=np.dtype(bool)), - "ts": pandas.Series([], dtype="datetime64[ns]"), + "ts": pandas.Series( + [], dtype=pandas.api.types.DatetimeTZDtype(tz="UTC") + ), } expected_result = DataFrame( empty_columns, columns=["title", "id", "is_bot", "ts"] diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 3a047741..e58b82c4 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -3,6 +3,7 @@ import pandas.util.testing as tm import pytest import numpy +import pandas.api.types from pandas import DataFrame import pandas_gbq.exceptions @@ -90,7 +91,7 @@ def no_auth(monkeypatch): ("INTEGER", None), # Can't handle NULL ("BOOLEAN", None), # Can't handle NULL ("FLOAT", numpy.dtype(float)), - ("TIMESTAMP", "datetime64[ns]"), + ("TIMESTAMP", pandas.api.types.DatetimeTZDtype(tz="UTC")), ("DATETIME", "datetime64[ns]"), ], )