ENH: show schemas difference when throwing InvalidSchema exception #350

Closed · wants to merge 1 commit
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -12,6 +12,8 @@ Features
- Load DataFrame with ``to_gbq`` to a table in a project different from the API
client project. Specify the target table ID as ``project.dataset.table`` to
use this feature. (:issue:`321`, :issue:`347`)
- In ``append`` mode, when the DataFrame schema differs from the BigQuery table
  schema (a missing field or a mismatched field type), include the specific
  differences in the exception message. (:issue:`349`)

Dependencies
~~~~~~~~~~~~
8 changes: 6 additions & 2 deletions pandas_gbq/gbq.py
@@ -1191,11 +1191,15 @@ def to_gbq(
 if not pandas_gbq.schema.schema_is_subset(
     original_schema, table_schema
 ):
-    raise InvalidSchema(
+    schema_difference = pandas_gbq.schema.schema_difference(
+        original_schema, table_schema
+    )
+    exception_message = (
         "Please verify that the structure and "
         "data types in the DataFrame match the "
-        "schema of the destination table."
+        "schema of the destination table.\n" + schema_difference
     )
+    raise InvalidSchema(exception_message)

# Update the local `table_schema` so mode matches.
# See: https://github.com/pydata/pandas-gbq/issues/315
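
For orientation, a minimal sketch of how the change surfaces to a caller. The project, dataset, and table names here are made up, and the printed text is only indicative of the shape of the new message; the exact wording is pinned down by the system test further below.

import pandas as pd
import pandas_gbq
from pandas_gbq.gbq import InvalidSchema

df = pd.DataFrame({"ints": [1, 2], "strs": ["a", "b"]})

try:
    # Hypothetical destination table whose schema does not match ``df``.
    pandas_gbq.to_gbq(
        df,
        "my_dataset.my_table",
        project_id="my-project",
        if_exists="append",
    )
except InvalidSchema as exc:
    # With this branch, the generic message ends with per-field differences,
    # e.g. "Field 'bools': no such field in the dataframe."
    print(exc)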
102 changes: 100 additions & 2 deletions pandas_gbq/schema.py
@@ -3,6 +3,13 @@
import copy


# String templates used in schemas comparison.
MISSING_FIELD_TMPL = "Field '{}': no such field in the dataframe."
DIFFERENT_FIELD_TYPE_TMPL = (
    "Field '{}' has different types: dataframe '{}', BigQuery '{}'."
)


# API may return data types as legacy SQL, so maintain a mapping of aliases
# from standard SQL to legacy data types.
_TYPE_ALIASES = {
@@ -30,7 +37,7 @@ def to_pandas_gbq(client_schema):
def _clean_schema_fields(fields):
    """Return a sanitized version of the schema for comparisons.

-   The ``mode`` and ``description`` properties areis ignored because they
+   The ``mode`` and ``description`` properties are ignored because they
    are not generated by func:`pandas_gbq.schema.generate_bq_schema`.
    """
    fields_sorted = sorted(fields, key=lambda field: field["name"])

@@ -42,8 +49,99 @@ def _clean_schema_fields(fields):
    return clean_schema


def schema_difference(schema_remote, schema_local):
    """Calculates the difference between two schemas and formats the output.

    Parameters
    ----------
    schema_remote : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'
    schema_local : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'

    Returns
    -------
    str
        Formatted schema difference output
    """
    _schema_difference = _calculate_schema_difference(
        schema_remote, schema_local
    )
    return _format_schema_difference(_schema_difference)
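
A quick illustration of the helper's intended behavior. The field names are invented, and the import only works with this PR branch installed, since the function is not part of a released pandas-gbq.

from pandas_gbq.schema import schema_difference  # only on this branch

remote = {
    "fields": [
        {"name": "A", "type": "FLOAT"},
        {"name": "B", "type": "STRING"},
    ]
}
local = {"fields": [{"name": "A", "type": "FLOAT64"}]}

# FLOAT64 is treated as an alias of FLOAT during cleaning, so only the
# missing field is reported.
print(schema_difference(remote, local))
# Field 'B': no such field in the dataframe.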


def _format_schema_difference(schema_difference):
    """Formats the schema difference.

    Only the first three differences are shown; if there are more, the
    output notes how many were omitted.

    Parameters
    ----------
    schema_difference : list[str]
        List of differences between schemas.

    Returns
    -------
    str
        Formatted schema difference output
    """
    if len(schema_difference) < 4:
        diff_to_show = "\n".join(schema_difference)
    else:
        diffs_left = len(schema_difference) - 3
        schema_difference = schema_difference[:3]
        if diffs_left != 0:

Collaborator: Doesn't this always evaluate to True? Might be throwing off our branch test-coverage numbers if so.

            schema_difference.append("And {} more left.".format(diffs_left))
        diff_to_show = "\n".join(schema_difference)

Comment on lines +91 to +98

Collaborator: Nit: Since the diff_to_show = "\n".join(schema_difference) is duplicated, we can refactor this a bit.

Suggested change:
-    if len(schema_difference) < 4:
-        diff_to_show = "\n".join(schema_difference)
-    else:
-        diffs_left = len(schema_difference) - 3
-        schema_difference = schema_difference[:3]
-        if diffs_left != 0:
-            schema_difference.append("And {} more left.".format(diffs_left))
-        diff_to_show = "\n".join(schema_difference)
+    if len(schema_difference) > 3:
+        diffs_left = len(schema_difference) - 3
+        schema_difference = schema_difference[:3]
+        schema_difference.append("And {} more left.".format(diffs_left))
+    diff_to_show = "\n".join(schema_difference)

    return diff_to_show
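
Two observations on the block above, purely for illustration: since the else branch only runs when there are at least four differences, diffs_left is always at least 1, which is what the reviewer's coverage question points at; and with five differences the formatter keeps the first three and reports the rest. A small sketch, assuming this branch is installed:

from pandas_gbq.schema import _format_schema_difference  # only on this branch

# Hypothetical input with five differences.
diffs = ["d1", "d2", "d3", "d4", "d5"]
print(_format_schema_difference(diffs))
# d1
# d2
# d3
# And 2 more left.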


def _calculate_schema_difference(schema_remote, schema_local):
    """Calculates difference in dataframe and BigQuery schemas.

    Compares dataframe and BigQuery schemas to identify exact differences
    in each field (field can be missing in the dataframe or field can have

Collaborator: Missing fields in the dataframe should be OK, right? It's extra fields in the dataframe that can be a problem (unless we allow field addition, as requested here: #107)

    a different type).

    Parameters
    ----------
    schema_remote : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'
    schema_local : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'

    Returns
    -------
    List[str]
        List of field differences
    """
    fields_remote = _clean_schema_fields(schema_remote.get("fields", []))
    fields_local = _clean_schema_fields(schema_local.get("fields", []))
    diff = []
    for field_remote in fields_remote:
        for field_local in fields_local:
            if field_local["name"] == field_remote["name"]:
                if field_local["type"] != field_remote["type"]:

Collaborator: This might miss type mismatches if the order of the columns is different in either field_remote or field_local (or does _clean_schema_fields sort them?). I think we'd want:

1. Loop through only fields_local (dataframe)
2. Check if the field name isn't in fields_remote (table)
3. Check if the field types don't match

                    diff.append(
                        DIFFERENT_FIELD_TYPE_TMPL.format(
                            field_local["name"],
                            field_local["type"],
                            field_remote["type"],
                        )
                    )
                break
        else:
            diff.append(MISSING_FIELD_TMPL.format(field_remote["name"]))
    return diff
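
Purely as an illustration of the restructuring suggested in the review comment above: iterate over the DataFrame (local) fields and index the table (remote) fields by name so column order cannot matter. This is a standalone sketch, not the PR's implementation; the extra-field message template is invented here.

DIFFERENT_FIELD_TYPE_TMPL = (
    "Field '{}' has different types: dataframe '{}', BigQuery '{}'."
)
EXTRA_FIELD_TMPL = "Field '{}': no such field in the destination table."


def calculate_schema_difference(schema_remote, schema_local):
    # Index the table (remote) fields by name so column order is irrelevant.
    remote_types = {
        field["name"]: field["type"]
        for field in schema_remote.get("fields", [])
    }
    diff = []
    # Walk the DataFrame (local) fields: extra local fields and type
    # mismatches are what would actually break an append.
    for field in schema_local.get("fields", []):
        name, local_type = field["name"], field["type"]
        if name not in remote_types:
            diff.append(EXTRA_FIELD_TMPL.format(name))
        elif local_type != remote_types[name]:
            diff.append(
                DIFFERENT_FIELD_TYPE_TMPL.format(
                    name, local_type, remote_types[name]
                )
            )
    return diff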


def schema_is_subset(schema_remote, schema_local):
-   """Indicate whether the schema to be uploaded is a subset
+   """Indicate whether the schema to be uploaded is a subset.

    Compare the BigQuery table identified in the parameters with
    the schema passed in and indicate whether a subset of the fields in
12 changes: 11 additions & 1 deletion tests/system/test_gbq.py
@@ -1048,14 +1048,24 @@ def test_upload_data_if_table_exists_append(self, project_id):
        assert result["num_rows"][0] == test_size * 2

        # Try inserting with a different schema, confirm failure
-       with pytest.raises(gbq.InvalidSchema):
+       with pytest.raises(gbq.InvalidSchema) as excinfo:
            gbq.to_gbq(
                df_different_schema,
                self.destination_table + test_id,
                project_id,
                if_exists="append",
                credentials=self.credentials,
            )
        expected_message = (
            "Please verify that the structure and "
            "data types in the DataFrame match the "
            "schema of the destination table.\n"
            "Field 'bools': no such field in the dataframe.\n"
            "Field 'flts': no such field in the dataframe.\n"
            "Field 'ints': no such field in the dataframe.\n"
            "And 2 more left."
        )
        assert expected_message == str(excinfo.value)

    def test_upload_subset_columns_if_table_exists_append(self, project_id):
        # Issue 24: Upload is succesful if dataframe has columns
73 changes: 73 additions & 0 deletions tests/unit/test_schema.py
@@ -69,6 +69,79 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test):
    assert not module_under_test.schema_is_subset(table_schema, tested_schema)


@pytest.mark.parametrize(
    "original_fields,dataframe_fields,expected_difference",
    [
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "FLOAT64"},
                {"name": "C", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            "Field 'C': no such field in the dataframe.",
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.",
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "STRING"},
                {"name": "C", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            (
                "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n"
                "Field 'C': no such field in the dataframe."
            ),
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},

Collaborator: I think we'll want some tests where the order of the columns doesn't line up. That way we can be sure we aren't showing errors that aren't actually errors.

{"name": "B", "type": "STRING"},
{"name": "C", "type": "STRING"},
{"name": "D", "type": "STRING"},
{"name": "E", "type": "STRING"},
],
[
{"name": "A", "type": "FLOAT64"},
{"name": "B", "type": "FLOAT"},
],
(
"Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n"
"Field 'C': no such field in the dataframe.\n"
"Field 'D': no such field in the dataframe.\n"
"And 1 more left."
),
),
],
)
def test_schema_difference(
module_under_test, original_fields, dataframe_fields, expected_difference
):
table_schema = {"fields": original_fields}
tested_schema = {"fields": dataframe_fields}
schema_difference = module_under_test.schema_difference(
table_schema, tested_schema
)
assert expected_difference == schema_difference
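
A sketch of the kind of order-insensitivity test the review comment above asks for. It is not part of this PR; it reuses the existing module_under_test fixture and would pass against the proposed implementation because _clean_schema_fields sorts fields by name.

def test_schema_difference_ignores_column_order(module_under_test):
    # Hypothetical test: same fields and types, different column order,
    # should report no differences.
    table_schema = {
        "fields": [
            {"name": "A", "type": "FLOAT"},
            {"name": "B", "type": "STRING"},
        ]
    }
    tested_schema = {
        "fields": [
            {"name": "B", "type": "STRING"},
            {"name": "A", "type": "FLOAT"},
        ]
    }
    difference = module_under_test.schema_difference(
        table_schema, tested_schema
    )
    assert difference == ""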


@pytest.mark.parametrize(
"dataframe,expected_schema",
[