From 6cbfb982386d8d8ed4db95e202058228ac1af8b1 Mon Sep 17 00:00:00 2001 From: Artem Rys Date: Fri, 18 Dec 2020 23:57:40 +0100 Subject: [PATCH] Show schemas difference when throwing InvalidSchema exception in ``to_gbq``. --- docs/source/changelog.rst | 2 + pandas_gbq/gbq.py | 8 ++- pandas_gbq/schema.py | 102 +++++++++++++++++++++++++++++++++++++- tests/system/test_gbq.py | 12 ++++- tests/unit/test_schema.py | 73 +++++++++++++++++++++++++++ 5 files changed, 192 insertions(+), 5 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 372a0c61..8446472e 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -12,6 +12,8 @@ Features - Load DataFrame with ``to_gbq`` to a table in a project different from the API client project. Specify the target table ID as ``project.dataset.table`` to use this feature. (:issue:`321`, :issue:`347`) +- In case of dataframe and BigQuery schemas difference in ``append`` mode ( + missing field or wrong type field) - show it in the exception. (:issue:`349`) Dependencies ~~~~~~~~~~~~ diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index bebe7e1e..c178ed35 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -1191,11 +1191,15 @@ def to_gbq( if not pandas_gbq.schema.schema_is_subset( original_schema, table_schema ): - raise InvalidSchema( + schema_difference = pandas_gbq.schema.schema_difference( + original_schema, table_schema + ) + exception_message = ( "Please verify that the structure and " "data types in the DataFrame match the " - "schema of the destination table." + "schema of the destination table.\n" + schema_difference ) + raise InvalidSchema(exception_message) # Update the local `table_schema` so mode matches. # See: https://github.com/pydata/pandas-gbq/issues/315 diff --git a/pandas_gbq/schema.py b/pandas_gbq/schema.py index ffc1c362..9b6f4f4b 100644 --- a/pandas_gbq/schema.py +++ b/pandas_gbq/schema.py @@ -3,6 +3,13 @@ import copy +# String templates used in schemas comparison. +MISSING_FIELD_TMPL = "Field '{}': no such field in the dataframe." +DIFFERENT_FIELD_TYPE_TMPL = ( + "Field '{}' has different types: dataframe '{}', BigQuery '{}'." +) + + # API may return data types as legacy SQL, so maintain a mapping of aliases # from standard SQL to legacy data types. _TYPE_ALIASES = { @@ -30,7 +37,7 @@ def to_pandas_gbq(client_schema): def _clean_schema_fields(fields): """Return a sanitized version of the schema for comparisons. - The ``mode`` and ``description`` properties areis ignored because they + The ``mode`` and ``description`` properties are ignored because they are not generated by func:`pandas_gbq.schema.generate_bq_schema`. """ fields_sorted = sorted(fields, key=lambda field: field["name"]) @@ -42,8 +49,99 @@ def _clean_schema_fields(fields): return clean_schema +def schema_difference(schema_remote, schema_local): + """Calculates schemas difference and formats the output. + + Parameters + ---------- + schema_remote : dict + Schema for comparison. Each item of ``fields`` should have a 'name' + and a 'type' + schema_local : dict + Schema for comparison. Each item of ``fields`` should have a 'name' + and a 'type' + + Returns + ------- + str + Formatted schema difference output + """ + _schema_difference = _calculate_schema_difference( + schema_remote, schema_local + ) + return _format_schema_difference(_schema_difference) + + +def _format_schema_difference(schema_difference): + """Formats the schema difference. + + By default it shows only 3 differences. In case of more - it + says how many more are left. + + Parameters + ---------- + schema_difference : list[str] + List of differences between schemas. + + Returns + ------- + str + Formatted schema difference output + """ + if len(schema_difference) < 4: + diff_to_show = "\n".join(schema_difference) + else: + diffs_left = len(schema_difference) - 3 + schema_difference = schema_difference[:3] + if diffs_left != 0: + schema_difference.append("And {} more left.".format(diffs_left)) + diff_to_show = "\n".join(schema_difference) + return diff_to_show + + +def _calculate_schema_difference(schema_remote, schema_local): + """Calculates difference in dataframe and BigQuery schemas. + + Compares dataframe and BigQuery schemas to identify exact differences + in each field (field can be missing in the dataframe or field can have + a different type). + + Parameters + ---------- + schema_remote : dict + Schema for comparison. Each item of ``fields`` should have a 'name' + and a 'type' + schema_local : dict + Schema for comparison. Each item of ``fields`` should have a 'name' + and a 'type' + + Returns + ------- + List[str] + List of field differences + """ + fields_remote = _clean_schema_fields(schema_remote.get("fields", [])) + fields_local = _clean_schema_fields(schema_local.get("fields", [])) + diff = [] + for field_remote in fields_remote: + for field_local in fields_local: + if field_local["name"] == field_remote["name"]: + if field_local["type"] != field_remote["type"]: + diff.append( + DIFFERENT_FIELD_TYPE_TMPL.format( + field_local["name"], + field_local["type"], + field_remote["type"], + ) + ) + break + else: + diff.append(MISSING_FIELD_TMPL.format(field_remote["name"])) + return diff + + def schema_is_subset(schema_remote, schema_local): - """Indicate whether the schema to be uploaded is a subset + """Indicate whether the schema to be uploaded is a subset. Compare the BigQuery table identified in the parameters with the schema passed in and indicate whether a subset of the fields in diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index a11d2e6e..7b04d0e8 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -1048,7 +1048,7 @@ def test_upload_data_if_table_exists_append(self, project_id): assert result["num_rows"][0] == test_size * 2 # Try inserting with a different schema, confirm failure - with pytest.raises(gbq.InvalidSchema): + with pytest.raises(gbq.InvalidSchema) as excinfo: gbq.to_gbq( df_different_schema, self.destination_table + test_id, @@ -1056,6 +1056,16 @@ def test_upload_data_if_table_exists_append(self, project_id): if_exists="append", credentials=self.credentials, ) + expected_message = ( + "Please verify that the structure and " + "data types in the DataFrame match the " + "schema of the destination table.\n" + "Field 'bools': no such field in the dataframe.\n" + "Field 'flts': no such field in the dataframe.\n" + "Field 'ints': no such field in the dataframe.\n" + "And 2 more left." + ) + assert expected_message == str(excinfo.value) def test_upload_subset_columns_if_table_exists_append(self, project_id): # Issue 24: Upload is succesful if dataframe has columns diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 16286a2d..3c9f2b4f 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -69,6 +69,79 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test): assert not module_under_test.schema_is_subset(table_schema, tested_schema) +@pytest.mark.parametrize( + "original_fields,dataframe_fields,expected_difference", + [ + ( + [ + {"name": "A", "type": "FLOAT"}, + {"name": "B", "type": "FLOAT64"}, + {"name": "C", "type": "STRING"}, + ], + [ + {"name": "A", "type": "FLOAT64"}, + {"name": "B", "type": "FLOAT"}, + ], + "Field 'C': no such field in the dataframe.", + ), + ( + [ + {"name": "A", "type": "FLOAT"}, + {"name": "B", "type": "STRING"}, + ], + [ + {"name": "A", "type": "FLOAT64"}, + {"name": "B", "type": "FLOAT"}, + ], + "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.", + ), + ( + [ + {"name": "A", "type": "FLOAT"}, + {"name": "B", "type": "STRING"}, + {"name": "C", "type": "STRING"}, + ], + [ + {"name": "A", "type": "FLOAT64"}, + {"name": "B", "type": "FLOAT"}, + ], + ( + "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n" + "Field 'C': no such field in the dataframe." + ), + ), + ( + [ + {"name": "A", "type": "FLOAT"}, + {"name": "B", "type": "STRING"}, + {"name": "C", "type": "STRING"}, + {"name": "D", "type": "STRING"}, + {"name": "E", "type": "STRING"}, + ], + [ + {"name": "A", "type": "FLOAT64"}, + {"name": "B", "type": "FLOAT"}, + ], + ( + "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n" + "Field 'C': no such field in the dataframe.\n" + "Field 'D': no such field in the dataframe.\n" + "And 1 more left." + ), + ), + ], +) +def test_schema_difference( + module_under_test, original_fields, dataframe_fields, expected_difference +): + table_schema = {"fields": original_fields} + tested_schema = {"fields": dataframe_fields} + schema_difference = module_under_test.schema_difference( + table_schema, tested_schema + ) + assert expected_difference == schema_difference + + @pytest.mark.parametrize( "dataframe,expected_schema", [