Skip to content

BUG: json_normalize not consistently ignoring errors (#41876) #42179

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 15, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ MultiIndex
I/O
^^^
- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
- Bug in :func:`json_normalize` where ``errors="ignore"`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
-

Expand Down
42 changes: 25 additions & 17 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,14 +380,31 @@ def _json_normalize(
Returns normalized data with columns prefixed with the given string.
"""

def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable:
def _pull_field(
js: dict[str, Any], spec: list | str, extract_record: bool = False
) -> Scalar | Iterable:
"""Internal function to pull field"""
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
try:
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]
except KeyError as e:
if extract_record:
raise KeyError(
f"Key {e} not found. If specifying a record_path, all elements of "
f"data should have the path."
) from e
elif errors == "ignore":
return np.nan
else:
raise KeyError(
f"Key {e} not found. To replace missing values of {e} with "
f"np.nan, pass in errors='ignore'"
) from e

return result

def _pull_records(js: dict[str, Any], spec: list | str) -> list:
Expand All @@ -396,7 +413,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
_pull_field, but is required to return a list, and will raise an error
if the value is not iterable.
"""
result = _pull_field(js, spec)
result = _pull_field(js, spec, extract_record=True)

# GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
# null, otherwise return an empty list
Expand Down Expand Up @@ -488,16 +505,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == "ignore":
meta_val = np.nan
else:
raise KeyError(
"Try running with errors='ignore' as key "
f"{e} is not always present"
) from e
meta_val = _pull_field(obj, val[level:])
meta_vals[key].append(meta_val)
records.extend(recs)

Expand Down
44 changes: 41 additions & 3 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def missing_metadata():
"zip": 44646,
}
],
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
},
{
"addresses": [
Expand All @@ -115,7 +116,8 @@ def missing_metadata():
"state": "TN",
"zip": 37643,
}
]
],
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
},
]

Expand Down Expand Up @@ -598,7 +600,10 @@ def test_json_normalize_errors(self, missing_metadata):
# If meta keys are not always present, errors='ignore' can be passed so
# that the missing values are filled with NaN instead of raising

msg = "Try running with errors='ignore' as key 'name' is not always present"
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
Expand All @@ -618,11 +623,44 @@ def test_missing_meta(self, missing_metadata):
[9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
[8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
]
columns = ["city", "number", "state", "street", "zip", "name"]
columns = ["number", "street", "city", "state", "zip", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)

def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
# GH41876
# Ensure errors='raise' works as intended even when a record_path of length
# greater than one is passed in
msg = (
"Key 'name' not found. To replace missing values of "
"'name' with np.nan, pass in errors='ignore'"
)
with pytest.raises(KeyError, match=msg):
json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="raise",
)

def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
# GH41876
# Ensure errors='ignore' works as intended even when a record_path of length
# greater than one is passed in
result = json_normalize(
data=missing_metadata,
record_path=["previous_residences", "cities"],
meta="name",
errors="ignore",
)
ex_data = [
["Foo York City", "Alice"],
["Barmingham", np.nan],
]
columns = ["city_name", "name"]
expected = DataFrame(ex_data, columns=columns)
tm.assert_frame_equal(result, expected)

def test_donot_drop_nonevalues(self):
# GH21356
data = [
Expand Down