From 3ae5d4c234f541fd4ab8117a0c92205cd1698e40 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 28 Dec 2021 14:04:01 -0600 Subject: [PATCH] test: improve `to_gbq` logic unit test coverage (#449) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🦉 Updates from OwlBot See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md Co-authored-by: Owl Bot --- .coveragerc | 2 +- noxfile.py | 2 +- owlbot.py | 2 +- pandas_gbq/gbq.py | 41 +++++---- pandas_gbq/load.py | 7 +- pandas_gbq/schema.py | 14 ++- setup.py | 5 +- testing/constraints-3.7.txt | 1 + tests/system/test_gbq.py | 5 -- tests/unit/conftest.py | 59 ++++++------- tests/unit/test_auth.py | 25 +++--- tests/unit/test_context.py | 16 ++++ tests/unit/test_features.py | 19 +++++ tests/unit/test_gbq.py | 161 +++++++++++++++++++++++++++++------ tests/unit/test_load.py | 113 ++++++++++++++++++++++++ tests/unit/test_schema.py | 37 ++++++++ tests/unit/test_timestamp.py | 9 ++ tests/unit/test_to_gbq.py | 122 ++++++++++++++++++++++++++ 18 files changed, 542 insertions(+), 98 deletions(-) create mode 100644 tests/unit/test_to_gbq.py diff --git a/.coveragerc b/.coveragerc index 88b85d03..0a3b1cea 100644 --- a/.coveragerc +++ b/.coveragerc @@ -22,7 +22,7 @@ omit = google/cloud/__init__.py [report] -fail_under = 89 +fail_under = 94 show_missing = True exclude_lines = # Re-enable the standard pragma diff --git a/noxfile.py b/noxfile.py index 398b4dc2..5e41983b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -259,7 +259,7 @@ def cover(session): test runs (not system test runs), and then erases coverage data. """ session.install("coverage", "pytest-cov") - session.run("coverage", "report", "--show-missing", "--fail-under=89") + session.run("coverage", "report", "--show-missing", "--fail-under=94") session.run("coverage", "erase") diff --git a/owlbot.py b/owlbot.py index 9849f98f..62c9f3c4 100644 --- a/owlbot.py +++ b/owlbot.py @@ -33,7 +33,7 @@ templated_files = common.py_library( unit_test_python_versions=["3.7", "3.8", "3.9", "3.10"], system_test_python_versions=["3.7", "3.8", "3.9", "3.10"], - cov_level=89, + cov_level=94, unit_test_extras=extras, system_test_extras=extras, intersphinx_dependencies={ diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 5dcc3fd0..0a18cc3a 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -121,7 +121,20 @@ class InvalidSchema(ValueError): table in BigQuery. """ - pass + def __init__( + self, message: str, local_schema: Dict[str, Any], remote_schema: Dict[str, Any] + ): + super().__init__(message) + self._local_schema = local_schema + self._remote_schema = remote_schema + + @property + def local_schema(self) -> Dict[str, Any]: + return self._local_schema + + @property + def remote_schema(self) -> Dict[str, Any]: + return self._remote_schema class NotFoundException(ValueError): @@ -354,19 +367,12 @@ def sizeof_fmt(num, suffix="B"): return fmt % (num, "Y", suffix) def get_client(self): + import google.api_core.client_info import pandas - try: - # This module was added in google-api-core 1.11.0. - # We don't have a hard requirement on that version, so only - # populate the client_info if available. 
-            import google.api_core.client_info
-
-            client_info = google.api_core.client_info.ClientInfo(
-                user_agent="pandas-{}".format(pandas.__version__)
-            )
-        except ImportError:
-            client_info = None
+        client_info = google.api_core.client_info.ClientInfo(
+            user_agent="pandas-{}".format(pandas.__version__)
+        )

         # In addition to new enough version of google-api-core, a new enough
         # version of google-cloud-bigquery is required to populate the
@@ -1057,7 +1063,7 @@ def to_gbq(
                 DeprecationWarning,
                 stacklevel=2,
             )
-        elif api_method == "load_csv":
+        else:
             warnings.warn(
                 "chunksize will be ignored when using api_method='load_csv' in a future version of pandas-gbq",
                 PendingDeprecationWarning,
@@ -1122,12 +1128,14 @@
         )
     elif if_exists == "replace":
         connector.delete_and_recreate_table(dataset_id, table_id, table_schema)
-    elif if_exists == "append":
+    else:
         if not pandas_gbq.schema.schema_is_subset(original_schema, table_schema):
             raise InvalidSchema(
                 "Please verify that the structure and "
                 "data types in the DataFrame match the "
-                "schema of the destination table."
+                "schema of the destination table.",
+                table_schema,
+                original_schema,
             )

     # Update the local `table_schema` so mode (NULLABLE/REQUIRED)
@@ -1283,9 +1291,6 @@ def delete(self, table_id):
         """
         from google.api_core.exceptions import NotFound

-        if not self.exists(table_id):
-            raise NotFoundException("Table does not exist")
-
         table_ref = self._table_ref(table_id)
         try:
             self.client.delete_table(table_ref)
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
index 315ad5cd..588a6719 100644
--- a/pandas_gbq/load.py
+++ b/pandas_gbq/load.py
@@ -185,6 +185,11 @@ def load_csv_from_file(
     chunksize: Optional[int],
     schema: Optional[Dict[str, Any]],
 ):
+    """Manually encode a DataFrame to CSV and use the buffer in a load job.
+
+    This method is needed for writing with google-cloud-bigquery versions that
+    don't implement load_table_from_dataframe with the CSV serialization format.
+    """
     if schema is None:
         schema = pandas_gbq.schema.generate_bq_schema(dataframe)

@@ -203,7 +208,7 @@
         finally:
             chunk_buffer.close()

-    return load_csv(dataframe, chunksize, bq_schema, load_chunk,)
+    return load_csv(dataframe, chunksize, bq_schema, load_chunk)


 def load_chunks(
diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py
index e2f97455..cfa1c765 100644
--- a/pandas_gbq/schema.py
+++ b/pandas_gbq/schema.py
@@ -21,7 +21,19 @@ def to_pandas_gbq(client_schema):
     """Given a sequence of :class:`google.cloud.bigquery.schema.SchemaField`,
     return a schema in pandas-gbq API format.
     """
-    remote_fields = [field_remote.to_api_repr() for field_remote in client_schema]
+    remote_fields = [
+        # Filter out default values. google-cloud-bigquery versions before
+        # 2.31.0 (https://github.com/googleapis/python-bigquery/pull/557)
+        # include a description key, even if not explicitly set. This has the
+        # potential to unset the description unintentionally in cases where
+        # pandas-gbq is updating the schema.
+ { + key: value + for key, value in field_remote.to_api_repr().items() + if value is not None + } + for field_remote in client_schema + ] for field in remote_fields: field["type"] = field["type"].upper() field["mode"] = field["mode"].upper() diff --git a/setup.py b/setup.py index 4be5a722..2e596cc6 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,9 @@ "pandas >=0.24.2", "pyarrow >=3.0.0, <7.0dev", "pydata-google-auth", - "google-auth", - "google-auth-oauthlib", + "google-api-core >=1.14.0", + "google-auth >=1.4.1", + "google-auth-oauthlib >=0.0.1", # 2.4.* has a bug where waiting for the query can hang indefinitely. # https://github.com/pydata/pandas-gbq/issues/343 "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*", diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 6c3080dc..2a500f35 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -6,6 +6,7 @@ # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 db-dtypes==0.3.1 +google-api-core==1.14.0 google-auth==1.4.1 google-auth-oauthlib==0.0.1 google-cloud-bigquery==1.11.1 diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 812d2089..67735c53 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -1522,11 +1522,6 @@ def test_delete_table(gbq_table): assert not gbq_table.exists("test_delete_table") -def test_delete_table_not_found(gbq_table): - with pytest.raises(gbq.NotFoundException): - gbq_table.delete("test_delete_table_not_found") - - def test_create_table_data_dataset_does_not_exist( project, credentials, gbq_dataset, random_dataset_id ): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3f0c5e53..50a9ac1a 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -9,6 +9,36 @@ import pytest +def mock_get_credentials(*args, **kwargs): + import google.auth.credentials + + mock_credentials = mock.create_autospec(google.auth.credentials.Credentials) + return mock_credentials, "default-project" + + +@pytest.fixture +def mock_service_account_credentials(): + import google.oauth2.service_account + + mock_credentials = mock.create_autospec(google.oauth2.service_account.Credentials) + return mock_credentials + + +@pytest.fixture +def mock_compute_engine_credentials(): + import google.auth.compute_engine + + mock_credentials = mock.create_autospec(google.auth.compute_engine.Credentials) + return mock_credentials + + +@pytest.fixture(autouse=True) +def no_auth(monkeypatch): + import pydata_google_auth + + monkeypatch.setattr(pydata_google_auth, "default", mock_get_credentials) + + @pytest.fixture(autouse=True, scope="function") def reset_context(): import pandas_gbq @@ -20,41 +50,12 @@ def reset_context(): @pytest.fixture(autouse=True) def mock_bigquery_client(monkeypatch): import google.cloud.bigquery - import google.cloud.bigquery.table mock_client = mock.create_autospec(google.cloud.bigquery.Client) # Constructor returns the mock itself, so this mock can be treated as the # constructor or the instance. 
mock_client.return_value = mock_client - - mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob) - mock_query.job_id = "some-random-id" - mock_query.state = "DONE" - mock_rows = mock.create_autospec(google.cloud.bigquery.table.RowIterator) - mock_rows.total_rows = 1 - - mock_rows.__iter__.return_value = [(1,)] - mock_query.result.return_value = mock_rows - mock_client.list_rows.return_value = mock_rows - mock_client.query.return_value = mock_query - # Mock table creation. monkeypatch.setattr(google.cloud.bigquery, "Client", mock_client) mock_client.reset_mock() - # Mock out SELECT 1 query results. - def generate_schema(): - query = mock_client.query.call_args[0][0] if mock_client.query.call_args else "" - if query == "SELECT 1 AS int_col": - return [google.cloud.bigquery.SchemaField("int_col", "INTEGER")] - else: - return [google.cloud.bigquery.SchemaField("_f0", "INTEGER")] - - type(mock_rows).schema = mock.PropertyMock(side_effect=generate_schema) - - # Mock out get_table. - def get_table(table_ref_or_id, **kwargs): - return google.cloud.bigquery.Table(table_ref_or_id) - - mock_client.get_table.side_effect = get_table - return mock_client diff --git a/tests/unit/test_auth.py b/tests/unit/test_auth.py index c101942e..d44c6380 100644 --- a/tests/unit/test_auth.py +++ b/tests/unit/test_auth.py @@ -28,35 +28,32 @@ def test_get_credentials_default_credentials(monkeypatch): import google.auth import google.auth.credentials import google.cloud.bigquery + import pydata_google_auth - def mock_default_credentials(scopes=None, request=None): - return ( - mock.create_autospec(google.auth.credentials.Credentials), - "default-project", - ) + mock_user_credentials = mock.create_autospec(google.auth.credentials.Credentials) + + def mock_default_credentials(scopes, **kwargs): + return (mock_user_credentials, "test-project") - monkeypatch.setattr(google.auth, "default", mock_default_credentials) + monkeypatch.setattr(pydata_google_auth, "default", mock_default_credentials) credentials, project = auth.get_credentials() - assert project == "default-project" + assert project == "test-project" assert credentials is not None def test_get_credentials_load_user_no_default(monkeypatch): import google.auth import google.auth.credentials + import pydata_google_auth import pydata_google_auth.cache - def mock_default_credentials(scopes=None, request=None): - return (None, None) - - monkeypatch.setattr(google.auth, "default", mock_default_credentials) mock_user_credentials = mock.create_autospec(google.auth.credentials.Credentials) - mock_cache = mock.create_autospec(pydata_google_auth.cache.CredentialsCache) - mock_cache.load.return_value = mock_user_credentials + def mock_default_credentials(scopes, **kwargs): + return (mock_user_credentials, None) - monkeypatch.setattr(auth, "get_credentials_cache", lambda _: mock_cache) + monkeypatch.setattr(pydata_google_auth, "default", mock_default_credentials) credentials, project = auth.get_credentials() assert project is None diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py index c0521745..1cf420f0 100644 --- a/tests/unit/test_context.py +++ b/tests/unit/test_context.py @@ -6,9 +6,25 @@ from unittest import mock +import google.cloud.bigquery +import google.cloud.bigquery.table import pytest +@pytest.fixture(autouse=True) +def default_bigquery_client(mock_bigquery_client): + mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob) + mock_query.job_id = "some-random-id" + mock_query.state = "DONE" + mock_rows = 
mock.create_autospec(google.cloud.bigquery.table.RowIterator) + mock_rows.total_rows = 1 + mock_rows.__iter__.return_value = [(1,)] + mock_query.result.return_value = mock_rows + mock_bigquery_client.list_rows.return_value = mock_rows + mock_bigquery_client.query.return_value = mock_query + return mock_bigquery_client + + @pytest.fixture(autouse=True) def mock_get_credentials(monkeypatch): from pandas_gbq import auth diff --git a/tests/unit/test_features.py b/tests/unit/test_features.py index b10b0fa8..d62480f3 100644 --- a/tests/unit/test_features.py +++ b/tests/unit/test_features.py @@ -10,6 +10,7 @@ @pytest.fixture(autouse=True) def fresh_bigquery_version(monkeypatch): monkeypatch.setattr(FEATURES, "_bigquery_installed_version", None) + monkeypatch.setattr(FEATURES, "_pandas_installed_version", None) @pytest.mark.parametrize( @@ -28,3 +29,21 @@ def test_bigquery_has_from_dataframe_with_csv(monkeypatch, bigquery_version, exp monkeypatch.setattr(google.cloud.bigquery, "__version__", bigquery_version) assert FEATURES.bigquery_has_from_dataframe_with_csv == expected + + +@pytest.mark.parametrize( + ["pandas_version", "expected"], + [ + ("0.14.7", False), + ("0.22.1", False), + ("0.23.0", True), + ("0.23.1", True), + ("1.0.0", True), + ("2.1.3", True), + ], +) +def test_pandas_has_deprecated_verbose(monkeypatch, pandas_version, expected): + import pandas + + monkeypatch.setattr(pandas, "__version__", pandas_version) + assert FEATURES.pandas_has_deprecated_verbose == expected diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index df9241bc..8740f618 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -32,34 +32,40 @@ def mock_get_credentials_no_project(*args, **kwargs): return mock_credentials, None -def mock_get_credentials(*args, **kwargs): - import google.auth.credentials - - mock_credentials = mock.create_autospec(google.auth.credentials.Credentials) - return mock_credentials, "default-project" - - -@pytest.fixture -def mock_service_account_credentials(): - import google.oauth2.service_account - - mock_credentials = mock.create_autospec(google.oauth2.service_account.Credentials) - return mock_credentials - - -@pytest.fixture -def mock_compute_engine_credentials(): - import google.auth.compute_engine +@pytest.fixture(autouse=True) +def default_bigquery_client(mock_bigquery_client): + mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob) + mock_query.job_id = "some-random-id" + mock_query.state = "DONE" + mock_rows = mock.create_autospec(google.cloud.bigquery.table.RowIterator) + mock_rows.total_rows = 1 + + mock_rows.__iter__.return_value = [(1,)] + mock_query.result.return_value = mock_rows + mock_bigquery_client.list_rows.return_value = mock_rows + mock_bigquery_client.query.return_value = mock_query + + # Mock out SELECT 1 query results. + def generate_schema(): + query = ( + mock_bigquery_client.query.call_args[0][0] + if mock_bigquery_client.query.call_args + else "" + ) + if query == "SELECT 1 AS int_col": + return [google.cloud.bigquery.SchemaField("int_col", "INTEGER")] + else: + return [google.cloud.bigquery.SchemaField("_f0", "INTEGER")] - mock_credentials = mock.create_autospec(google.auth.compute_engine.Credentials) - return mock_credentials + type(mock_rows).schema = mock.PropertyMock(side_effect=generate_schema) + # Mock out get_table. 
+ def get_table(table_ref_or_id, **kwargs): + return google.cloud.bigquery.Table(table_ref_or_id) -@pytest.fixture(autouse=True) -def no_auth(monkeypatch): - import pydata_google_auth + mock_bigquery_client.get_table.side_effect = get_table - monkeypatch.setattr(pydata_google_auth, "default", mock_get_credentials) + return mock_bigquery_client @pytest.mark.parametrize( @@ -290,7 +296,7 @@ def test_to_gbq_w_project_table(mock_bigquery_client): assert table.project == "project_table" -def test_to_gbq_creates_dataset(mock_bigquery_client): +def test_to_gbq_create_dataset(mock_bigquery_client): import google.api_core.exceptions mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound( @@ -303,6 +309,111 @@ def test_to_gbq_creates_dataset(mock_bigquery_client): mock_bigquery_client.create_dataset.assert_called_with(mock.ANY) +def test_dataset_create_already_exists_translates_exception(mock_bigquery_client): + connector = gbq._Dataset("my-project") + connector.client = mock_bigquery_client + mock_bigquery_client.get_dataset.return_value = object() + with pytest.raises(gbq.DatasetCreationError): + connector.create("already_exists") + + +def test_dataset_exists_false(mock_bigquery_client): + connector = gbq._Dataset("my-project") + connector.client = mock_bigquery_client + mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound( + "nope" + ) + assert not connector.exists("not_exists") + + +def test_dataset_exists_true(mock_bigquery_client): + connector = gbq._Dataset("my-project") + connector.client = mock_bigquery_client + mock_bigquery_client.get_dataset.return_value = object() + assert connector.exists("yes_exists") + + +def test_dataset_exists_translates_exception(mock_bigquery_client): + connector = gbq._Dataset("my-project") + connector.client = mock_bigquery_client + mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.InternalServerError( + "something went wrong" + ) + with pytest.raises(gbq.GenericGBQException): + connector.exists("not_gonna_work") + + +def test_table_create_already_exists(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.get_table.return_value = object() + with pytest.raises(gbq.TableCreationError): + connector.create( + "already_exists", {"fields": [{"name": "f", "type": "STRING"}]} + ) + + +def test_table_create_translates_exception(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound( + "nope" + ) + mock_bigquery_client.create_table.side_effect = google.api_core.exceptions.InternalServerError( + "something went wrong" + ) + with pytest.raises(gbq.GenericGBQException): + connector.create( + "not_gonna_work", {"fields": [{"name": "f", "type": "STRING"}]} + ) + + +def test_table_delete_notfound_ok(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.delete_table.side_effect = google.api_core.exceptions.NotFound( + "nope" + ) + connector.delete("not_exists") + mock_bigquery_client.delete_table.assert_called_once() + + +def test_table_delete_translates_exception(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.delete_table.side_effect = google.api_core.exceptions.InternalServerError( + 
"something went wrong" + ) + with pytest.raises(gbq.GenericGBQException): + connector.delete("not_gonna_work") + + +def test_table_exists_false(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound( + "nope" + ) + assert not connector.exists("not_exists") + + +def test_table_exists_true(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.get_table.return_value = object() + assert connector.exists("yes_exists") + + +def test_table_exists_translates_exception(mock_bigquery_client): + connector = gbq._Table("my-project", "my_dataset") + connector.client = mock_bigquery_client + mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.InternalServerError( + "something went wrong" + ) + with pytest.raises(gbq.GenericGBQException): + connector.exists("not_gonna_work") + + def test_read_gbq_with_no_project_id_given_should_fail(monkeypatch): import pydata_google_auth diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 8e18cfb9..24f262e6 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -16,6 +16,7 @@ import pandas.testing import pytest +from pandas_gbq import exceptions from pandas_gbq.features import FEATURES from pandas_gbq import load @@ -95,6 +96,85 @@ def test_encode_chunks_with_chunksize_none(): assert len(chunk.index) == 6 +def test_load_csv_from_dataframe_allows_client_to_generate_schema(mock_bigquery_client): + import google.cloud.bigquery + + df = pandas.DataFrame({"int_col": [1, 2, 3]}) + destination = google.cloud.bigquery.TableReference.from_string( + "my-project.my_dataset.my_table" + ) + + _ = list( + load.load_csv_from_dataframe( + mock_bigquery_client, df, destination, None, None, None + ) + ) + + mock_load = mock_bigquery_client.load_table_from_dataframe + assert mock_load.called + _, kwargs = mock_load.call_args + assert "job_config" in kwargs + assert kwargs["job_config"].schema is None + + +def test_load_csv_from_file_generates_schema(mock_bigquery_client): + import google.cloud.bigquery + + df = pandas.DataFrame( + { + "int_col": [1, 2, 3], + "bool_col": [True, False, True], + "float_col": [0.0, 1.25, -2.75], + "string_col": ["a", "b", "c"], + "datetime_col": pandas.Series( + [ + "2021-12-21 13:28:40.123789", + "2000-01-01 11:10:09", + "2040-10-31 23:59:59.999999", + ], + dtype="datetime64[ns]", + ), + "timestamp_col": pandas.Series( + [ + "2021-12-21 13:28:40.123789", + "2000-01-01 11:10:09", + "2040-10-31 23:59:59.999999", + ], + dtype="datetime64[ns]", + ).dt.tz_localize(datetime.timezone.utc), + } + ) + destination = google.cloud.bigquery.TableReference.from_string( + "my-project.my_dataset.my_table" + ) + + _ = list( + load.load_csv_from_file(mock_bigquery_client, df, destination, None, None, None) + ) + + mock_load = mock_bigquery_client.load_table_from_file + assert mock_load.called + _, kwargs = mock_load.call_args + assert "job_config" in kwargs + sent_schema = kwargs["job_config"].schema + assert len(sent_schema) == len(df.columns) + assert sent_schema[0].name == "int_col" + assert sent_schema[0].field_type == "INTEGER" + assert sent_schema[1].name == "bool_col" + assert sent_schema[1].field_type == "BOOLEAN" + assert sent_schema[2].name == "float_col" + assert sent_schema[2].field_type == "FLOAT" + assert sent_schema[3].name == "string_col" + assert sent_schema[3].field_type == 
"STRING" + # TODO: Disambiguate TIMESTAMP from DATETIME based on if column is + # localized or at least use field type from table metadata. See: + # https://github.com/googleapis/python-bigquery-pandas/issues/450 + assert sent_schema[4].name == "datetime_col" + assert sent_schema[4].field_type == "TIMESTAMP" + assert sent_schema[5].name == "timestamp_col" + assert sent_schema[5].field_type == "TIMESTAMP" + + @pytest.mark.parametrize( ["bigquery_has_from_dataframe_with_csv", "api_method"], [(True, "load_parquet"), (True, "load_csv"), (False, "load_csv")], @@ -143,6 +223,39 @@ def test_load_chunks_with_invalid_api_method(): load.load_chunks(None, None, None, api_method="not_a_thing") +def test_load_parquet_allows_client_to_generate_schema(mock_bigquery_client): + import google.cloud.bigquery + + df = pandas.DataFrame({"int_col": [1, 2, 3]}) + destination = google.cloud.bigquery.TableReference.from_string( + "my-project.my_dataset.my_table" + ) + + load.load_parquet(mock_bigquery_client, df, destination, None, None) + + mock_load = mock_bigquery_client.load_table_from_dataframe + assert mock_load.called + _, kwargs = mock_load.call_args + assert "job_config" in kwargs + assert kwargs["job_config"].schema is None + + +def test_load_parquet_with_bad_conversion(mock_bigquery_client): + import google.cloud.bigquery + import pyarrow + + mock_bigquery_client.load_table_from_dataframe.side_effect = ( + pyarrow.lib.ArrowInvalid() + ) + df = pandas.DataFrame({"int_col": [1, 2, 3]}) + destination = google.cloud.bigquery.TableReference.from_string( + "my-project.my_dataset.my_table" + ) + + with pytest.raises(exceptions.ConversionError): + load.load_parquet(mock_bigquery_client, df, destination, None, None) + + @pytest.mark.parametrize( ("numeric_type",), ( diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 743ddc26..d31ac2e9 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -3,7 +3,9 @@ # license that can be found in the LICENSE file. 
import datetime +from typing import Any, Dict, List +import google.cloud.bigquery import pandas import pytest @@ -151,3 +153,38 @@ def test_generate_bq_schema(module_under_test, dataframe, expected_schema): def test_update_schema(module_under_test, schema_old, schema_new, expected_output): output = module_under_test.update_schema(schema_old, schema_new) assert output == expected_output + + +@pytest.mark.parametrize( + ["bq_schema", "expected"], + [ + ([], {"fields": []}), + ( + [google.cloud.bigquery.SchemaField("test_col", "STRING")], + {"fields": [{"name": "test_col", "type": "STRING", "mode": "NULLABLE"}]}, + ), + ( + [google.cloud.bigquery.SchemaField("test_col", "STRING", mode="REQUIRED")], + {"fields": [{"name": "test_col", "type": "STRING", "mode": "REQUIRED"}]}, + ), + ( + [ + google.cloud.bigquery.SchemaField("test1", "STRING"), + google.cloud.bigquery.SchemaField("test2", "INTEGER"), + ], + { + "fields": [ + {"name": "test1", "type": "STRING", "mode": "NULLABLE"}, + {"name": "test2", "type": "INTEGER", "mode": "NULLABLE"}, + ] + }, + ), + ], +) +def test_to_pandas_gbq( + bq_schema: List[google.cloud.bigquery.SchemaField], expected: Dict[str, Any] +): + import pandas_gbq.schema + + result = pandas_gbq.schema.to_pandas_gbq(bq_schema) + assert result == expected diff --git a/tests/unit/test_timestamp.py b/tests/unit/test_timestamp.py index 406643d0..b35c1307 100644 --- a/tests/unit/test_timestamp.py +++ b/tests/unit/test_timestamp.py @@ -56,6 +56,14 @@ def test_localize_df_with_timestamp_column(module_under_test): dtype="datetime64[ns]", ), "float_col": [0.1, 0.2, 0.3], + "repeated_col": pandas.Series( + [ + ["2011-01-01 01:02:03"], + ["2012-02-02 04:05:06"], + ["2013-03-03 07:08:09"], + ], + dtype="object", + ), } ) expected = df.copy() @@ -64,6 +72,7 @@ def test_localize_df_with_timestamp_column(module_under_test): {"name": "integer_col", "type": "INTEGER"}, {"name": "timestamp_col", "type": "TIMESTAMP"}, {"name": "float_col", "type": "FLOAT"}, + {"name": "repeated_col", "type": "TIMESTAMP", "mode": "REPEATED"}, ] localized = module_under_test.localize_df(df, bq_schema) diff --git a/tests/unit/test_to_gbq.py b/tests/unit/test_to_gbq.py new file mode 100644 index 00000000..e488bdb5 --- /dev/null +++ b/tests/unit/test_to_gbq.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 pandas-gbq Authors All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. 
+ +import google.cloud.bigquery +import google.api_core.exceptions +from pandas import DataFrame +import pytest + +from pandas_gbq import gbq +from pandas_gbq.features import FEATURES + + +@pytest.fixture +def expected_load_method(mock_bigquery_client): + if FEATURES.pandas_has_parquet_with_lossless_timestamp: + return mock_bigquery_client.load_table_from_dataframe + return mock_bigquery_client.load_table_from_file + + +def test_to_gbq_create_dataset_with_location(mock_bigquery_client): + mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound( + "my_table" + ) + mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound( + "my_dataset" + ) + gbq.to_gbq( + DataFrame([[1]]), "my_dataset.my_table", project_id="1234", location="us-west1" + ) + assert mock_bigquery_client.create_dataset.called + args, _ = mock_bigquery_client.create_dataset.call_args + sent_dataset = args[0] + assert sent_dataset.location == "us-west1" + + +def test_to_gbq_create_dataset_translates_exception(mock_bigquery_client): + mock_bigquery_client.get_table.side_effect = google.api_core.exceptions.NotFound( + "my_table" + ) + mock_bigquery_client.get_dataset.side_effect = google.api_core.exceptions.NotFound( + "my_dataset" + ) + mock_bigquery_client.create_dataset.side_effect = google.api_core.exceptions.InternalServerError( + "something went wrong" + ) + + with pytest.raises(gbq.GenericGBQException): + gbq.to_gbq(DataFrame([[1]]), "my_dataset.my_table", project_id="1234") + + +def test_to_gbq_with_if_exists_append(mock_bigquery_client, expected_load_method): + from google.cloud.bigquery import SchemaField + + mock_bigquery_client.get_table.return_value = google.cloud.bigquery.Table( + "myproj.my_dataset.my_table", + schema=( + SchemaField("col_a", "FLOAT", mode="REQUIRED"), + SchemaField("col_b", "STRING", mode="REQUIRED"), + ), + ) + gbq.to_gbq( + DataFrame({"col_a": [0.25, 1.5, -1.0], "col_b": ["a", "b", "c"]}), + "my_dataset.my_table", + project_id="myproj", + if_exists="append", + ) + expected_load_method.assert_called_once() + + +def test_to_gbq_with_if_exists_append_mismatch(mock_bigquery_client): + from google.cloud.bigquery import SchemaField + + mock_bigquery_client.get_table.return_value = google.cloud.bigquery.Table( + "myproj.my_dataset.my_table", + schema=(SchemaField("col_a", "INTEGER"), SchemaField("col_b", "STRING")), + ) + with pytest.raises(gbq.InvalidSchema) as exception_block: + gbq.to_gbq( + DataFrame({"col_a": [0.25, 1.5, -1.0]}), + "my_dataset.my_table", + project_id="myproj", + if_exists="append", + ) + + exc = exception_block.value + assert exc.remote_schema == { + "fields": [ + {"name": "col_a", "type": "INTEGER", "mode": "NULLABLE"}, + {"name": "col_b", "type": "STRING", "mode": "NULLABLE"}, + ] + } + assert exc.local_schema == {"fields": [{"name": "col_a", "type": "FLOAT"}]} + + +def test_to_gbq_with_if_exists_replace(mock_bigquery_client): + mock_bigquery_client.get_table.side_effect = ( + # Initial check + google.cloud.bigquery.Table("myproj.my_dataset.my_table"), + # Recreate check + google.api_core.exceptions.NotFound("my_table"), + ) + gbq.to_gbq( + DataFrame([[1]]), + "my_dataset.my_table", + project_id="myproj", + if_exists="replace", + ) + # TODO: We can avoid these API calls by using write disposition in the load + # job. 
See: https://github.com/googleapis/python-bigquery-pandas/issues/118 + assert mock_bigquery_client.delete_table.called + assert mock_bigquery_client.create_table.called + + +def test_to_gbq_with_if_exists_unknown(): + with pytest.raises(ValueError): + gbq.to_gbq( + DataFrame([[1]]), + "my_dataset.my_table", + project_id="myproj", + if_exists="unknown", + )
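
As a usage sketch (not part of the patch itself): the reworked `InvalidSchema`
above carries both schemas, so callers can inspect a mismatch programmatically
instead of parsing the error message. The project and table names below are
hypothetical placeholders, and the call assumes valid credentials and an
existing destination table with a different schema.

    # Hedged sketch of catching the enriched InvalidSchema from this patch.
    # "my-project" and "my_dataset.my_table" are hypothetical placeholders.
    import pandas

    from pandas_gbq import gbq

    df = pandas.DataFrame({"col_a": [0.25, 1.5, -1.0]})
    try:
        gbq.to_gbq(
            df, "my_dataset.my_table", project_id="my-project", if_exists="append"
        )
    except gbq.InvalidSchema as exc:
        # Both properties return schemas in pandas-gbq format: {"fields": [...]},
        # as asserted in test_to_gbq_with_if_exists_append_mismatch above.
        print("DataFrame schema:", exc.local_schema)
        print("Destination table schema:", exc.remote_schema)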