ENH: show schemas difference when throwing InvalidSchema exception #350

Closed · wants to merge 1 commit
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -12,6 +12,8 @@ Features
- Load DataFrame with ``to_gbq`` to a table in a project different from the API
client project. Specify the target table ID as ``project.dataset.table`` to
use this feature. (:issue:`321`, :issue:`347`)
- In ``append`` mode, when the DataFrame schema differs from the BigQuery table
  schema (a missing field or a mismatched field type), include the specific
  differences in the exception message. (:issue:`349`)

Dependencies
~~~~~~~~~~~~
8 changes: 6 additions & 2 deletions pandas_gbq/gbq.py
@@ -1191,11 +1191,15 @@ def to_gbq(
 if not pandas_gbq.schema.schema_is_subset(
     original_schema, table_schema
 ):
-    raise InvalidSchema(
+    schema_difference = pandas_gbq.schema.schema_difference(
+        original_schema, table_schema
+    )
+    exception_message = (
         "Please verify that the structure and "
         "data types in the DataFrame match the "
-        "schema of the destination table."
+        "schema of the destination table.\n" + schema_difference
     )
+    raise InvalidSchema(exception_message)

# Update the local `table_schema` so mode matches.
# See: https://github.com/pydata/pandas-gbq/issues/315
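
For orientation, a minimal sketch of how the change surfaces to a caller. The project, dataset, and table names here are made up, and the printed text is only indicative of the shape of the new message; the exact wording is pinned down by the system test further below.

import pandas as pd
import pandas_gbq
from pandas_gbq.gbq import InvalidSchema

df = pd.DataFrame({"ints": [1, 2], "strs": ["a", "b"]})

try:
    # Hypothetical destination table whose schema does not match ``df``.
    pandas_gbq.to_gbq(
        df,
        "my_dataset.my_table",
        project_id="my-project",
        if_exists="append",
    )
except InvalidSchema as exc:
    # With this branch, the generic message ends with per-field differences,
    # e.g. "Field 'bools': no such field in the dataframe."
    print(exc)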
102 changes: 100 additions & 2 deletions pandas_gbq/schema.py
@@ -3,6 +3,13 @@
import copy


# String templates used in schemas comparison.
MISSING_FIELD_TMPL = "Field '{}': no such field in the dataframe."
DIFFERENT_FIELD_TYPE_TMPL = (
    "Field '{}' has different types: dataframe '{}', BigQuery '{}'."
)


# API may return data types as legacy SQL, so maintain a mapping of aliases
# from standard SQL to legacy data types.
_TYPE_ALIASES = {
@@ -30,7 +37,7 @@ def to_pandas_gbq(client_schema):
def _clean_schema_fields(fields):
    """Return a sanitized version of the schema for comparisons.

-   The ``mode`` and ``description`` properties areis ignored because they
+   The ``mode`` and ``description`` properties are ignored because they
    are not generated by func:`pandas_gbq.schema.generate_bq_schema`.
    """
    fields_sorted = sorted(fields, key=lambda field: field["name"])

@@ -42,8 +49,99 @@ def _clean_schema_fields(fields):
    return clean_schema


def schema_difference(schema_remote, schema_local):
    """Calculates the difference between two schemas and formats the output.

    Parameters
    ----------
    schema_remote : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'
    schema_local : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'

    Returns
    -------
    str
        Formatted schema difference output
    """
    _schema_difference = _calculate_schema_difference(
        schema_remote, schema_local
    )
    return _format_schema_difference(_schema_difference)
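
A quick illustration of the helper's intended behavior. The field names are invented, and the import only works with this PR branch installed, since the function is not part of a released pandas-gbq.

from pandas_gbq.schema import schema_difference  # only on this branch

remote = {
    "fields": [
        {"name": "A", "type": "FLOAT"},
        {"name": "B", "type": "STRING"},
    ]
}
local = {"fields": [{"name": "A", "type": "FLOAT64"}]}

# FLOAT64 is treated as an alias of FLOAT during cleaning, so only the
# missing field is reported.
print(schema_difference(remote, local))
# Field 'B': no such field in the dataframe.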


def _format_schema_difference(schema_difference):
    """Formats the schema difference.

    Only the first three differences are shown; if there are more, the
    output notes how many were omitted.

    Parameters
    ----------
    schema_difference : list[str]
        List of differences between schemas.

    Returns
    -------
    str
        Formatted schema difference output
    """
    if len(schema_difference) < 4:
        diff_to_show = "\n".join(schema_difference)
    else:
        diffs_left = len(schema_difference) - 3
        schema_difference = schema_difference[:3]
        if diffs_left != 0:

Collaborator: Doesn't this always evaluate to True? Might be throwing off our branch test-coverage numbers if so.

            schema_difference.append("And {} more left.".format(diffs_left))
        diff_to_show = "\n".join(schema_difference)

Comment on lines +91 to +98

Collaborator: Nit: Since the diff_to_show = "\n".join(schema_difference) is duplicated, we can refactor this a bit.

Suggested change:
-    if len(schema_difference) < 4:
-        diff_to_show = "\n".join(schema_difference)
-    else:
-        diffs_left = len(schema_difference) - 3
-        schema_difference = schema_difference[:3]
-        if diffs_left != 0:
-            schema_difference.append("And {} more left.".format(diffs_left))
-        diff_to_show = "\n".join(schema_difference)
+    if len(schema_difference) > 3:
+        diffs_left = len(schema_difference) - 3
+        schema_difference = schema_difference[:3]
+        schema_difference.append("And {} more left.".format(diffs_left))
+    diff_to_show = "\n".join(schema_difference)

    return diff_to_show
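
Two observations on the block above, purely for illustration: since the else branch only runs when there are at least four differences, diffs_left is always at least 1, which is what the reviewer's coverage question points at; and with five differences the formatter keeps the first three and reports the rest. A small sketch, assuming this branch is installed:

from pandas_gbq.schema import _format_schema_difference  # only on this branch

# Hypothetical input with five differences.
diffs = ["d1", "d2", "d3", "d4", "d5"]
print(_format_schema_difference(diffs))
# d1
# d2
# d3
# And 2 more left.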


def _calculate_schema_difference(schema_remote, schema_local):
    """Calculates difference in dataframe and BigQuery schemas.

    Compares dataframe and BigQuery schemas to identify exact differences
    in each field (field can be missing in the dataframe or field can have

Collaborator: Missing fields in the dataframe should be OK, right? It's extra fields in the dataframe that can be a problem (unless we allow field addition, as requested here: #107)

    a different type).

    Parameters
    ----------
    schema_remote : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'
    schema_local : dict
        Schema for comparison. Each item of ``fields`` should have a 'name'
        and a 'type'

    Returns
    -------
    List[str]
        List of field differences
    """
    fields_remote = _clean_schema_fields(schema_remote.get("fields", []))
    fields_local = _clean_schema_fields(schema_local.get("fields", []))
    diff = []
    for field_remote in fields_remote:
        for field_local in fields_local:
            if field_local["name"] == field_remote["name"]:
                if field_local["type"] != field_remote["type"]:

Collaborator: This might miss type mismatches if the order of the columns is different in either field_remote or field_local (or does _clean_schema_fields sort them?). I think we'd want:

1. Loop through only fields_local (dataframe)
2. Check if the field name isn't in fields_remote (table)
3. Check if the field types don't match

                    diff.append(
                        DIFFERENT_FIELD_TYPE_TMPL.format(
                            field_local["name"],
                            field_local["type"],
                            field_remote["type"],
                        )
                    )
                break
        else:
            diff.append(MISSING_FIELD_TMPL.format(field_remote["name"]))
    return diff
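
Purely as an illustration of the restructuring suggested in the review comment above: iterate over the DataFrame (local) fields and index the table (remote) fields by name so column order cannot matter. This is a standalone sketch, not the PR's implementation; the extra-field message template is invented here.

DIFFERENT_FIELD_TYPE_TMPL = (
    "Field '{}' has different types: dataframe '{}', BigQuery '{}'."
)
EXTRA_FIELD_TMPL = "Field '{}': no such field in the destination table."


def calculate_schema_difference(schema_remote, schema_local):
    # Index the table (remote) fields by name so column order is irrelevant.
    remote_types = {
        field["name"]: field["type"]
        for field in schema_remote.get("fields", [])
    }
    diff = []
    # Walk the DataFrame (local) fields: extra local fields and type
    # mismatches are what would actually break an append.
    for field in schema_local.get("fields", []):
        name, local_type = field["name"], field["type"]
        if name not in remote_types:
            diff.append(EXTRA_FIELD_TMPL.format(name))
        elif local_type != remote_types[name]:
            diff.append(
                DIFFERENT_FIELD_TYPE_TMPL.format(
                    name, local_type, remote_types[name]
                )
            )
    return diff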


def schema_is_subset(schema_remote, schema_local):
-   """Indicate whether the schema to be uploaded is a subset
+   """Indicate whether the schema to be uploaded is a subset.

    Compare the BigQuery table identified in the parameters with
    the schema passed in and indicate whether a subset of the fields in
12 changes: 11 additions & 1 deletion tests/system/test_gbq.py
@@ -1048,14 +1048,24 @@ def test_upload_data_if_table_exists_append(self, project_id):
        assert result["num_rows"][0] == test_size * 2

        # Try inserting with a different schema, confirm failure
-       with pytest.raises(gbq.InvalidSchema):
+       with pytest.raises(gbq.InvalidSchema) as excinfo:
            gbq.to_gbq(
                df_different_schema,
                self.destination_table + test_id,
                project_id,
                if_exists="append",
                credentials=self.credentials,
            )
        expected_message = (
            "Please verify that the structure and "
            "data types in the DataFrame match the "
            "schema of the destination table.\n"
            "Field 'bools': no such field in the dataframe.\n"
            "Field 'flts': no such field in the dataframe.\n"
            "Field 'ints': no such field in the dataframe.\n"
            "And 2 more left."
        )
        assert expected_message == str(excinfo.value)

    def test_upload_subset_columns_if_table_exists_append(self, project_id):
        # Issue 24: Upload is succesful if dataframe has columns
73 changes: 73 additions & 0 deletions tests/unit/test_schema.py
@@ -69,6 +69,79 @@ def test_schema_is_subset_fails_if_not_subset(module_under_test):
    assert not module_under_test.schema_is_subset(table_schema, tested_schema)


@pytest.mark.parametrize(
    "original_fields,dataframe_fields,expected_difference",
    [
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "FLOAT64"},
                {"name": "C", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            "Field 'C': no such field in the dataframe.",
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.",
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},
                {"name": "B", "type": "STRING"},
                {"name": "C", "type": "STRING"},
            ],
            [
                {"name": "A", "type": "FLOAT64"},
                {"name": "B", "type": "FLOAT"},
            ],
            (
                "Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n"
                "Field 'C': no such field in the dataframe."
            ),
        ),
        (
            [
                {"name": "A", "type": "FLOAT"},

Collaborator: I think we'll want some tests where the order of the columns doesn't line up. That way we can be sure we aren't showing errors that aren't actually errors.

{"name": "B", "type": "STRING"},
{"name": "C", "type": "STRING"},
{"name": "D", "type": "STRING"},
{"name": "E", "type": "STRING"},
],
[
{"name": "A", "type": "FLOAT64"},
{"name": "B", "type": "FLOAT"},
],
(
"Field 'B' has different types: dataframe 'FLOAT', BigQuery 'STRING'.\n"
"Field 'C': no such field in the dataframe.\n"
"Field 'D': no such field in the dataframe.\n"
"And 1 more left."
),
),
],
)
def test_schema_difference(
module_under_test, original_fields, dataframe_fields, expected_difference
):
table_schema = {"fields": original_fields}
tested_schema = {"fields": dataframe_fields}
schema_difference = module_under_test.schema_difference(
table_schema, tested_schema
)
assert expected_difference == schema_difference
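
A sketch of the kind of order-insensitivity test the review comment above asks for. It is not part of this PR; it reuses the existing module_under_test fixture and would pass against the proposed implementation because _clean_schema_fields sorts fields by name.

def test_schema_difference_ignores_column_order(module_under_test):
    # Hypothetical test: same fields and types, different column order,
    # should report no differences.
    table_schema = {
        "fields": [
            {"name": "A", "type": "FLOAT"},
            {"name": "B", "type": "STRING"},
        ]
    }
    tested_schema = {
        "fields": [
            {"name": "B", "type": "STRING"},
            {"name": "A", "type": "FLOAT"},
        ]
    }
    difference = module_under_test.schema_difference(
        table_schema, tested_schema
    )
    assert difference == ""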


@pytest.mark.parametrize(
"dataframe,expected_schema",
[