diff --git a/doc/source/io.rst b/doc/source/io.rst
index 96ec624f4fd3c..d436fa52918d3 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4579,8 +4579,7 @@ a ``TableCreationError`` if the destination table already exists.
 
 If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
 be written to the table using the defined table schema and column types. The
-dataframe must match the destination table in column order, structure, and
-data types.
+dataframe must match the destination table in structure and data types.
 If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
 different schema, a delay of 2 minutes will be forced to ensure that the new schema
 has propagated in the Google environment. See
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index f3a6736ff9920..3fcc848f4c225 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
 Google BigQuery Enhancements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+The :func:`pandas.io.gbq.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`).
 
 .. _whatsnew_0190.errstate:
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 8f23e82daf2e3..d6f8660f20ef6 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -547,12 +547,17 @@ def verify_schema(self, dataset_id, table_id, schema):
         from apiclient.errors import HttpError
 
         try:
-            return (self.service.tables().get(
+            remote_schema = self.service.tables().get(
                 projectId=self.project_id,
                 datasetId=dataset_id,
-                tableId=table_id
-            ).execute()['schema']) == schema
+                tableId=table_id).execute()['schema']
+            fields_remote = set([json.dumps(field_remote)
+                                 for field_remote in remote_schema['fields']])
+            fields_local = set(json.dumps(field_local)
+                               for field_local in schema['fields'])
+
+            return fields_remote == fields_local
 
         except HttpError as ex:
             self.process_http_error(ex)
 
@@ -819,10 +824,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
             dataset_id, table_id, table_schema)
     elif if_exists == 'append':
         if not connector.verify_schema(dataset_id, table_id, table_schema):
-            raise InvalidSchema("Please verify that the column order, "
-                                "structure and data types in the "
-                                "DataFrame match the schema of the "
-                                "destination table.")
+            raise InvalidSchema("Please verify that the structure and "
+                                "data types in the DataFrame match the "
+                                "schema of the destination table.")
     else:
         table.create(table_id, table_schema)
 
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 921fd824d6ffd..0ea4b5204e150 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -743,6 +743,8 @@ def setUp(self):
                                     private_key=_get_private_key_path())
         self.table = gbq._Table(_get_project_id(), DATASET_ID + "1",
                                 private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id(),
+                                    private_key=_get_private_key_path())
 
     @classmethod
     def tearDownClass(cls):
@@ -906,6 +908,69 @@ def test_list_table(self):
                         'Expected table list to contain table {0}'
                         .format(destination_table))
 
+    def test_verify_schema_allows_flexible_column_order(self):
+        destination_table = TABLE_ID + "10"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertTrue(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected schema to match')
+
+    def test_verify_schema_fails_different_data_type(self):
+        destination_table = TABLE_ID + "11"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'STRING'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_verify_schema_fails_different_structure(self):
+        destination_table = TABLE_ID + "12"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B2', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_upload_data_flexible_column_order(self):
+        destination_table = DESTINATION_TABLE + "13"
+
+        test_size = 10
+        df = make_mixed_dataframe_v2(test_size)
+
+        # Initialize table with sample data
+        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
+                   private_key=_get_private_key_path())
+
+        df_columns_reversed = df[df.columns[::-1]]
+
+        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
+                   if_exists='append', private_key=_get_private_key_path())
+
     def test_list_dataset(self):
         dataset_id = DATASET_ID + "1"
         self.assertTrue(dataset_id in self.dataset.datasets(),
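
Note on the verify_schema change: the new implementation compares schemas as
unordered sets by JSON-serializing each field descriptor, so a DataFrame whose
columns are merely reordered still passes, while a renamed column or a changed
type still fails. Below is a minimal standalone sketch of that comparison; the
schemas_match helper name is hypothetical and not part of the patch. Unlike
the patch, the sketch passes sort_keys=True to json.dumps so the serialized
form of each field does not depend on dict key order; the patch itself relies
on the remote and local field dicts serializing their keys in the same order.

    import json

    def schemas_match(remote_schema, local_schema):
        # Mirror of the patched comparison: serialize each field dict to a
        # JSON string (dicts are unhashable, strings are not), then compare
        # the two schemas as unordered sets of fields.
        # sort_keys=True is a safeguard not present in the patch.
        fields_remote = set(json.dumps(field, sort_keys=True)
                            for field in remote_schema['fields'])
        fields_local = set(json.dumps(field, sort_keys=True)
                           for field in local_schema['fields'])
        return fields_remote == fields_local

    # Same fields in a different order -> match (flexible column order).
    remote = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                         {'name': 'B', 'type': 'STRING'}]}
    reordered = {'fields': [{'name': 'B', 'type': 'STRING'},
                            {'name': 'A', 'type': 'FLOAT'}]}
    assert schemas_match(remote, reordered)

    # A changed type (or name) -> no match, as the new tests expect.
    retyped = {'fields': [{'name': 'A', 'type': 'STRING'},
                          {'name': 'B', 'type': 'STRING'}]}
    assert not schemas_match(remote, retyped)

Set comparison also means duplicate field entries would collapse, which is
harmless here because a BigQuery schema cannot contain two fields with the
same name.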