ENH: Allow partial table schema in to_gbq() table_schema (#218) (#257)
* ENH: Allow partial table schema in to_gbq

* CLN: applied black

* BUG: make update_schema python 2.7 compatible

* DOC: update docs to allow for a subset of columns in to_gbq table_schema

* DOC: what's new

* DOC: close parens around issue in changelog
JohnPaton authored and max-sixty committed Mar 12, 2019
1 parent d06db4b commit 547812e
Showing 4 changed files with 97 additions and 6 deletions.
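In practice, this change lets ``table_schema`` describe only the columns whose BigQuery types you want to override; the remaining columns are inferred from the DataFrame dtypes. A minimal usage sketch, assuming a hypothetical destination table and project ID (the DataFrame and column names are illustrative only):

import pandas as pd
import pandas_gbq

df = pd.DataFrame(
    {
        "name": ["a", "b", "c"],
        "num_items": [1, 2, 3],
        "score": [0.1, 0.2, 0.3],
    }
)

# Only "num_items" is overridden (stored as FLOAT instead of the inferred
# INTEGER); "name" and "score" fall back to dtype-based inference.
pandas_gbq.to_gbq(
    df,
    destination_table="my_dataset.my_table",  # hypothetical dataset.table
    project_id="my-project",                   # hypothetical project ID
    table_schema=[{"name": "num_items", "type": "FLOAT"}],
)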
7 changes: 6 additions & 1 deletion docs/source/changelog.rst
@@ -20,6 +20,11 @@ Internal changes
- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
function. (:issue:`247`)

Enhancements
~~~~~~~~~~~~
- Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
with the rest being populated using the DataFrame dtypes (:issue:`218`)
(contributed by @johnpaton)

.. _changelog-0.9.0:

@@ -237,4 +242,4 @@ Initial release of transferred code from `pandas <https://github.com/pandas-dev/pandas>`__
Includes patches since the 0.19.2 release on pandas with the following:

- :func:`read_gbq` now allows query configuration preferences `pandas-GH#14742 <https://github.com/pandas-dev/pandas/pull/14742>`__
- :func:`read_gbq` now stores ``INTEGER`` columns as ``dtype=object`` if they contain ``NULL`` values. Otherwise they are stored as ``int64``. This prevents precision loss for integers greater than 2**53. Furthermore ``FLOAT`` columns with values above 10**4 are no longer cast to ``int64``, which also caused precision loss `pandas-GH#14064 <https://github.com/pandas-dev/pandas/pull/14064>`__, and `pandas-GH#14305 <https://github.com/pandas-dev/pandas/pull/14305>`__
21 changes: 16 additions & 5 deletions pandas_gbq/gbq.py
Expand Up @@ -939,9 +939,11 @@ def to_gbq(
'STRING'},...]``.
If schema is not provided, it will be
generated according to dtypes of DataFrame columns.
If schema is provided, it must contain all DataFrame columns.
pandas_gbq.gbq._generate_bq_schema() may be used to create an initial
schema, though it doesn't preserve column order.
If schema is provided, it may contain all or a subset of DataFrame
columns. If a subset is provided, the rest will be inferred from
the DataFrame dtypes.
pandas_gbq.gbq._generate_bq_schema() may be used to create an
initial schema, though it doesn't preserve column order.
See BigQuery API documentation on available names of a field.
.. versionadded:: 0.3.1
@@ -1023,10 +1025,13 @@ def to_gbq(
credentials=connector.credentials,
)

default_schema = _generate_bq_schema(dataframe)
if not table_schema:
table_schema = _generate_bq_schema(dataframe)
table_schema = default_schema
else:
table_schema = dict(fields=table_schema)
table_schema = _update_bq_schema(
default_schema, dict(fields=table_schema)
)

# If table exists, check if_exists parameter
if table.exists(table_id):
@@ -1091,6 +1096,12 @@ def _generate_bq_schema(df, default_type="STRING"):
return schema.generate_bq_schema(df, default_type=default_type)


def _update_bq_schema(schema_old, schema_new):
from pandas_gbq import schema

return schema.update_schema(schema_old, schema_new)


class _Table(GbqConnector):
def __init__(
self,
29 changes: 29 additions & 0 deletions pandas_gbq/schema.py
Expand Up @@ -31,3 +31,32 @@ def generate_bq_schema(dataframe, default_type="STRING"):
)

return {"fields": fields}


def update_schema(schema_old, schema_new):
"""
Given an old BigQuery schema, update it with a new one.
Where a field name is the same, the new will replace the old. Any
new fields not present in the old schema will be added.
Arguments:
schema_old: the old schema to update
schema_new: the new schema which will overwrite/extend the old
"""
old_fields = schema_old["fields"]
new_fields = schema_new["fields"]
output_fields = list(old_fields)

field_indices = {field["name"]: i for i, field in enumerate(output_fields)}

for field in new_fields:
name = field["name"]
if name in field_indices:
# replace old field with new field of same name
output_fields[field_indices[name]] = field
else:
# add new field
output_fields.append(field)

return {"fields": output_fields}
46 changes: 46 additions & 0 deletions tests/unit/test_schema.py
Expand Up @@ -54,3 +54,49 @@
def test_generate_bq_schema(dataframe, expected_schema):
schema = pandas_gbq.schema.generate_bq_schema(dataframe)
assert schema == expected_schema


@pytest.mark.parametrize(
"schema_old,schema_new,expected_output",
[
(
{"fields": [{"name": "col1", "type": "INTEGER"}]},
{"fields": [{"name": "col2", "type": "TIMESTAMP"}]},
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "TIMESTAMP"},
]
},
),
(
{"fields": [{"name": "col1", "type": "INTEGER"}]},
{"fields": [{"name": "col1", "type": "BOOLEAN"}]},
{"fields": [{"name": "col1", "type": "BOOLEAN"}]},
),
(
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "INTEGER"},
]
},
{
"fields": [
{"name": "col2", "type": "BOOLEAN"},
{"name": "col3", "type": "FLOAT"},
]
},
{
"fields": [
{"name": "col1", "type": "INTEGER"},
{"name": "col2", "type": "BOOLEAN"},
{"name": "col3", "type": "FLOAT"},
]
},
),
],
)
def test_update_schema(schema_old, schema_new, expected_output):
output = pandas_gbq.schema.update_schema(schema_old, schema_new)
assert output == expected_output
