ENH: Allow partial table schema in to_gbq() table_schema (#218) (#257)
* ENH: Allow partial table schema in to_gbq

* CLN: applied black

* BUG: make update_schema python 2.7 compatible

* DOC: update docs to allow for a subset of columns in to_gbq table_schema

* DOC: what's new

* DOC: close parens around issue in changelog
JohnPaton authored and max-sixty committed Mar 12, 2019
1 parent d06db4b commit 547812e
Showing 4 changed files with 97 additions and 6 deletions.
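In practice, this change lets ``table_schema`` describe only the columns whose BigQuery types you want to override; the remaining columns are inferred from the DataFrame dtypes. A minimal usage sketch, assuming a hypothetical destination table and project ID (the DataFrame and column names are illustrative only):

import pandas as pd
import pandas_gbq

df = pd.DataFrame(
    {
        "name": ["a", "b", "c"],
        "num_items": [1, 2, 3],
        "score": [0.1, 0.2, 0.3],
    }
)

# Only "num_items" is overridden (stored as FLOAT instead of the inferred
# INTEGER); "name" and "score" fall back to dtype-based inference.
pandas_gbq.to_gbq(
    df,
    destination_table="my_dataset.my_table",  # hypothetical dataset.table
    project_id="my-project",                   # hypothetical project ID
    table_schema=[{"name": "num_items", "type": "FLOAT"}],
)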
7 changes: 6 additions & 1 deletion docs/source/changelog.rst
@@ -20,6 +20,11 @@ Internal changes
- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
function. (:issue:`247`)

Enhancements
~~~~~~~~~~~~
- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
with the rest being populated using the DataFrame dtypes (:issue:`218`)
(contributed by @johnpaton)

.. _changelog-0.9.0:

@@ -237,4 +242,4 @@ Initial release of transferred code from `pandas <https://github.com/pandas-dev/pandas>`__
Includes patches since the 0.19.2 release on pandas with the following:

- :func:`read_gbq` now allows query configuration preferences `pandas-GH#14742 <https://github.com/pandas-dev/pandas/pull/14742>`__
- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/pull/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/pull/14305>`__
21 changes: 16 additions & 5 deletions pandas_gbq/gbq.py
Expand Up @@ -939,9 +939,11 @@ def to_gbq(
'STRING'},...]``.
If schema is not provided, it will be
generated according to dtypes of DataFrame columns.
If schema is provided, it must contain all DataFrame columns.
pandas_gbq.gbq._generate_bq_schema() may be used to create an initial
schema, though it doesn't preserve column order.
If schema is provided, it may contain all or a subset of DataFrame
columns. If a subset is provided, the rest will be inferred from
the DataFrame dtypes.
pandas_gbq.gbq._generate_bq_schema() may be used to create an
initial schema, though it doesn't preserve column order.
See BigQuery API documentation on available names of a field.
.. versionadded:: 0.3.1
@@ -1023,10 +1025,13 @@ def to_gbq(
credentials=connector.credentials,
)

default_schema = _generate_bq_schema(dataframe)
if not table_schema:
table_schema = _generate_bq_schema(dataframe)
table_schema = default_schema
else:
table_schema = dict(fields=table_schema)
table_schema = _update_bq_schema(
default_schema, dict(fields=table_schema)
)

# If table exists, check if_exists parameter
if table.exists(table_id):
@@ -1091,6 +1096,12 @@ def _generate_bq_schema(df, default_type="STRING"):
return schema.generate_bq_schema(df, default_type=default_type)


def _update_bq_schema(schema_old, schema_new):
from pandas_gbq import schema

return schema.update_schema(schema_old, schema_new)


class _Table(GbqConnector):
def __init__(
self,
29 changes: 29 additions & 0 deletions pandas_gbq/schema.py
Expand Up @@ -31,3 +31,32 @@ def generate_bq_schema(dataframe, default_type="STRING"):
)

return {"fields": fields}


def update_schema(schema_old, schema_new):
"""
Given an old BigQuery schema, update it with a new one.
Where a field name is the same, the new will replace the old. Any
new fields not present in the old schema will be added.
Arguments:
schema_old: the old schema to update
schema_new: the new schema which will overwrite/extend the old
"""
old_fields = schema_old["fields"]
new_fields = schema_new["fields"]
output_fields = list(old_fields)

field_indices = {field["name"]: i for i, field in enumerate(output_fields)}

for field in new_fields:
name = field["name"]
if name in field_indices:
# replace old field with new field of same name
output_fields[field_indices[name]] = field
else:
# add new field
output_fields.append(field)

return {"fields": output_fields}
46 changes: 46 additions & 0 deletions tests/unit/test_schema.py
Expand Up @@ -54,3 +54,49 @@
def test_generate_bq_schema(dataframe, expected_schema):
schema = pandas_gbq.schema.generate_bq_schema(dataframe)
assert schema == expected_schema


@pytest.mark.parametrize(
"schema_old,schema_new,expected_output",
[
(
{"fields": [{"name": "col1", "type": "INTEGER"}]},
{"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "TIMESTAMP"},
]
},
),
(
{"fields": [{"name": "col1", "type": "INTEGER"}]},
{"fields": [{"name": "col1", "type": "BOOLEAN"}]},
{"fields": [{"name": "col1", "type": "BOOLEAN"}]},
),
(
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "INTEGER"},
]
},
{
"fields": [
{"name": "col2", "type": "BOOLEAN"},
{"name": "col3", "type": "FLOAT"},
]
},
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "BOOLEAN"},
{"name": "col3", "type": "FLOAT"},
]
},
),
],
)
def test_update_schema(schema_old, schema_new, expected_output):
output = pandas_gbq.schema.update_schema(schema_old, schema_new)
assert output == expected_output
