From 547812eaf46f4c8a440d8ae0fdde3af8cd81fd2c Mon Sep 17 00:00:00 2001
From: John Paton
Date: Tue, 12 Mar 2019 19:17:15 +0100
Subject: [PATCH] ENH: Allow partial table schema in to_gbq() table_schema
 (#218) (#257)

* ENH: Allow partial table schema in to_gbq

* CLN: applied black

* BUG: make update_schema python 2.7 compatible

* DOC: update docs to allow for a subset of columns in to_gbq table_schema

* DOC: what's new

* DOC: close parens around issue in changelog
---
 docs/source/changelog.rst |  7 +++++-
 pandas_gbq/gbq.py         | 21 +++++++++++++-----
 pandas_gbq/schema.py      | 29 ++++++++++++++++++++++++
 tests/unit/test_schema.py | 46 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 3b43ccd3..e3c0edd7 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -20,6 +20,11 @@ Internal changes
 - Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
   function. (:issue:`247`)
 
+Enhancements
+~~~~~~~~~~~~
+- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
+  with the rest being populated using the DataFrame dtypes (:issue:`218`)
+  (contributed by @johnpaton)
 
 .. _changelog-0.9.0:
 
@@ -237,4 +242,4 @@ Initial release of transfered code from `pandas
 <https://github.com/pandas-dev/pandas>`__
 
 
-- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision lost for integers greather than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer casted to ``int64`` which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/issues/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/issues/14305>`__
\ No newline at end of file
+- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore, ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/issues/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/issues/14305>`__
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index b59c3f94..2fa31e4f 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -939,9 +939,11 @@ def to_gbq(
         'STRING'},...]``. If schema is not provided, it will be
         generated according to dtypes of DataFrame columns.
-        If schema is provided, it must contain all DataFrame columns.
-        pandas_gbq.gbq._generate_bq_schema() may be used to create an initial
-        schema, though it doesn't preserve column order.
+        If schema is provided, it may contain all or a subset of DataFrame
+        columns. If a subset is provided, the rest will be inferred from
+        the DataFrame dtypes.
+        pandas_gbq.gbq._generate_bq_schema() may be used to create an
+        initial schema, though it doesn't preserve column order.
         See BigQuery API documentation on available names of a field.
 
         .. versionadded:: 0.3.1
 
@@ -1023,10 +1025,13 @@ def to_gbq(
         credentials=connector.credentials,
     )
 
+    default_schema = _generate_bq_schema(dataframe)
     if not table_schema:
-        table_schema = _generate_bq_schema(dataframe)
+        table_schema = default_schema
     else:
-        table_schema = dict(fields=table_schema)
+        table_schema = _update_bq_schema(
+            default_schema, dict(fields=table_schema)
+        )
 
     # If table exists, check if_exists parameter
     if table.exists(table_id):
@@ -1091,6 +1096,12 @@ def _generate_bq_schema(df, default_type="STRING"):
     return schema.generate_bq_schema(df, default_type=default_type)
 
 
+def _update_bq_schema(schema_old, schema_new):
+    from pandas_gbq import schema
+
+    return schema.update_schema(schema_old, schema_new)
+
+
 class _Table(GbqConnector):
     def __init__(
         self,
diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py
index 3ca03025..c59ed68e 100644
--- a/pandas_gbq/schema.py
+++ b/pandas_gbq/schema.py
@@ -31,3 +31,32 @@ def generate_bq_schema(dataframe, default_type="STRING"):
         )
 
     return {"fields": fields}
+
+
+def update_schema(schema_old, schema_new):
+    """
+    Given an old BigQuery schema, update it with a new one.
+
+    Where a field name is the same, the new will replace the old. Any
+    new fields not present in the old schema will be added.
+
+    Arguments:
+        schema_old: the old schema to update
+        schema_new: the new schema which will overwrite/extend the old
+    """
+    old_fields = schema_old["fields"]
+    new_fields = schema_new["fields"]
+    output_fields = list(old_fields)
+
+    field_indices = {field["name"]: i for i, field in enumerate(output_fields)}
+
+    for field in new_fields:
+        name = field["name"]
+        if name in field_indices:
+            # replace old field with new field of same name
+            output_fields[field_indices[name]] = field
+        else:
+            # add new field
+            output_fields.append(field)
+
+    return {"fields": output_fields}
diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py
index 74f22f29..af3b2043 100644
--- a/tests/unit/test_schema.py
+++ b/tests/unit/test_schema.py
@@ -54,3 +54,49 @@ def test_generate_bq_schema(dataframe, expected_schema):
     schema = pandas_gbq.schema.generate_bq_schema(dataframe)
 
     assert schema == expected_schema
+
+
+@pytest.mark.parametrize(
+    "schema_old,schema_new,expected_output",
+    [
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "TIMESTAMP"},
+                ]
+            },
+        ),
+        (
+            {"fields": [{"name": "col1", "type": "INTEGER"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+            {"fields": [{"name": "col1", "type": "BOOLEAN"}]},
+        ),
+        (
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "INTEGER"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+            {
+                "fields": [
+                    {"name": "col1", "type": "INTEGER"},
+                    {"name": "col2", "type": "BOOLEAN"},
+                    {"name": "col3", "type": "FLOAT"},
+                ]
+            },
+        ),
+    ],
+)
+def test_update_schema(schema_old, schema_new, expected_output):
+    output = pandas_gbq.schema.update_schema(schema_old, schema_new)
+    assert output == expected_output
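
Usage sketch (illustrative, not part of the patch): with this change applied,
``table_schema`` passed to ``to_gbq()`` may name only the columns whose
BigQuery types need overriding; every other column falls back to the type
inferred from its dtype by ``_generate_bq_schema()``. A minimal sketch,
assuming a hypothetical destination table ``my_dataset.my_table`` and project
``my-project``:

    import pandas as pd
    import pandas_gbq

    df = pd.DataFrame(
        {
            "name": ["Alice"],
            "created": pd.to_datetime(["2019-03-12"]),
        }
    )

    # Override only "created"; "name" is still inferred as STRING from
    # its object dtype by the default schema generator.
    pandas_gbq.to_gbq(
        df,
        "my_dataset.my_table",  # hypothetical destination table
        project_id="my-project",  # hypothetical project id
        table_schema=[{"name": "created", "type": "DATE"}],
    )

The merge itself is performed by the new ``pandas_gbq.schema.update_schema()``,
which keeps the old schema's field order, replaces fields by name, and appends
fields it has not seen, as covered by the parametrized cases in
``tests/unit/test_schema.py``. It can also be exercised directly:

    from pandas_gbq.schema import update_schema

    old = {"fields": [{"name": "col1", "type": "INTEGER"},
                      {"name": "col2", "type": "STRING"}]}
    new = {"fields": [{"name": "col2", "type": "TIMESTAMP"}]}

    update_schema(old, new)
    # {'fields': [{'name': 'col1', 'type': 'INTEGER'},
    #             {'name': 'col2', 'type': 'TIMESTAMP'}]}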