
In gbq.to_gbq allow the DataFrame column order to differ from schema #14202

Closed
3 changes: 1 addition & 2 deletions doc/source/io.rst
@@ -4579,8 +4579,7 @@ a ``TableCreationError`` if the destination table already exists.

 If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
 be written to the table using the defined table schema and column types. The
-dataframe must match the destination table in column order, structure, and
-data types.
+dataframe must match the destination table in structure and data types.
 If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
 different schema, a delay of 2 minutes will be forced to ensure that the new schema
 has propagated in the Google environment. See
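In practice, the relaxed ``'append'`` check means a DataFrame whose columns are ordered differently from the destination table no longer triggers ``InvalidSchema``. A minimal sketch, assuming a hypothetical project id and destination table and already-configured BigQuery credentials:

import pandas as pd
import pandas.io.gbq as gbq

# Hypothetical names; replace with a real project and dataset.table.
project_id = 'my-project'
destination_table = 'my_dataset.my_table'

df = pd.DataFrame({'A': [1.0, 2.0],
                   'B': [3.0, 4.0],
                   'C': ['x', 'y']})

# Create the table with columns in the order A, B, C.
gbq.to_gbq(df, destination_table, project_id)

# Appending with the columns reversed now succeeds: the schema check
# compares field names and types, not positions.
gbq.to_gbq(df[df.columns[::-1]], destination_table, project_id,
           if_exists='append')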
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -397,6 +397,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
 Google BigQuery Enhancements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+The :func:`pandas.io.gbq.to_gbq` method now allows the DataFrame column order to differ from the destination table schema (:issue:`11359`).

 .. _whatsnew_0190.errstate:

18 changes: 11 additions & 7 deletions pandas/io/gbq.py
@@ -547,12 +547,17 @@ def verify_schema(self, dataset_id, table_id, schema):
         from apiclient.errors import HttpError

         try:
-            return (self.service.tables().get(
+            remote_schema = self.service.tables().get(
                 projectId=self.project_id,
                 datasetId=dataset_id,
-                tableId=table_id
-            ).execute()['schema']) == schema
+                tableId=table_id).execute()['schema']
+
+            fields_remote = set([json.dumps(field_remote)
+                                 for field_remote in remote_schema['fields']])
+            fields_local = set(json.dumps(field_local)
+                               for field_local in schema['fields'])
+
+            return fields_remote == fields_local
         except HttpError as ex:
             self.process_http_error(ex)

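The rewritten check serializes each field dict to a JSON string so that the (unhashable) dicts can be collected into sets; set equality then ignores field order while still distinguishing field names and types. A self-contained sketch of the idea with made-up schemas (``sort_keys=True`` is added here to make the serialization independent of dict key order, which the patch itself leaves implicit):

import json

def schemas_match(remote_schema, local_schema):
    # json.dumps turns each field dict into a hashable string; comparing
    # the resulting sets discards ordering but keeps names and types.
    remote = set(json.dumps(field, sort_keys=True)
                 for field in remote_schema['fields'])
    local = set(json.dumps(field, sort_keys=True)
                for field in local_schema['fields'])
    return remote == local

schema_a = {'fields': [{'name': 'A', 'type': 'FLOAT'},
                       {'name': 'B', 'type': 'STRING'}]}
schema_b = {'fields': [{'name': 'B', 'type': 'STRING'},
                       {'name': 'A', 'type': 'FLOAT'}]}
schema_c = {'fields': [{'name': 'A', 'type': 'STRING'},
                       {'name': 'B', 'type': 'STRING'}]}

assert schemas_match(schema_a, schema_b)      # same fields, new order
assert not schemas_match(schema_a, schema_c)  # 'A' has a different type

Note that a set comparison also collapses duplicate fields, which is harmless here because BigQuery column names are unique within a table.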
@@ -819,10 +824,9 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
                 dataset_id, table_id, table_schema)
         elif if_exists == 'append':
             if not connector.verify_schema(dataset_id, table_id, table_schema):
-                raise InvalidSchema("Please verify that the column order, "
-                                    "structure and data types in the "
-                                    "DataFrame match the schema of the "
-                                    "destination table.")
+                raise InvalidSchema("Please verify that the structure and "
+                                    "data types in the DataFrame match the "
+                                    "schema of the destination table.")
         else:
             table.create(table_id, table_schema)

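Conversely, a genuine mismatch still fails fast on append. A hypothetical session showing the reworded error (project and table names are placeholders):

import pandas as pd
import pandas.io.gbq as gbq

df = pd.DataFrame({'A': [1.0], 'B': [2.0]})
gbq.to_gbq(df, 'my_dataset.my_table', 'my-project')

# Renaming a column changes the structure, so verify_schema returns
# False and to_gbq raises InvalidSchema.
gbq.to_gbq(df.rename(columns={'B': 'B2'}), 'my_dataset.my_table',
           'my-project', if_exists='append')
# InvalidSchema: Please verify that the structure and data types in the
# DataFrame match the schema of the destination table.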
65 changes: 65 additions & 0 deletions pandas/io/tests/test_gbq.py
@@ -743,6 +743,8 @@ def setUp(self):
                                     private_key=_get_private_key_path())
         self.table = gbq._Table(_get_project_id(), DATASET_ID + "1",
                                 private_key=_get_private_key_path())
+        self.sut = gbq.GbqConnector(_get_project_id(),
+                                    private_key=_get_private_key_path())

     @classmethod
     def tearDownClass(cls):
@@ -906,6 +908,69 @@ def test_list_table(self):
                         'Expected table list to contain table {0}'
                         .format(destination_table))

+    def test_verify_schema_allows_flexible_column_order(self):
+        destination_table = TABLE_ID + "10"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertTrue(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected schema to match')
+
+    def test_verify_schema_fails_different_data_type(self):
+        destination_table = TABLE_ID + "11"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'STRING'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_verify_schema_fails_different_structure(self):
+        destination_table = TABLE_ID + "12"
+        test_schema_1 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+        test_schema_2 = {'fields': [{'name': 'A', 'type': 'FLOAT'},
+                                    {'name': 'B2', 'type': 'FLOAT'},
+                                    {'name': 'C', 'type': 'STRING'},
+                                    {'name': 'D', 'type': 'TIMESTAMP'}]}
+
+        self.table.create(destination_table, test_schema_1)
+        self.assertFalse(self.sut.verify_schema(
+            DATASET_ID + "1", destination_table, test_schema_2),
+            'Expected different schema')
+
+    def test_upload_data_flexible_column_order(self):
+        destination_table = DESTINATION_TABLE + "13"
+
+        test_size = 10
+        df = make_mixed_dataframe_v2(test_size)
+
+        # Initialize table with sample data
+        gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000,
+                   private_key=_get_private_key_path())
+
+        df_columns_reversed = df[df.columns[::-1]]
+
+        gbq.to_gbq(df_columns_reversed, destination_table, _get_project_id(),
+                   if_exists='append', private_key=_get_private_key_path())
+
     def test_list_dataset(self):
         dataset_id = DATASET_ID + "1"
         self.assertTrue(dataset_id in self.dataset.datasets(),