diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index acae6e111b7ca..f233509215f99 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -233,6 +233,7 @@ MultiIndex
 I/O
 ^^^
 - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
+- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
 - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
 -
 
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
index 5927d6482d3b0..729d60ca78944 100644
--- a/pandas/io/json/_normalize.py
+++ b/pandas/io/json/_normalize.py
@@ -380,14 +380,31 @@ def _json_normalize(
     Returns normalized data with columns prefixed with the given string.
     """
 
-    def _pull_field(js: dict[str, Any], spec: list | str) -> Scalar | Iterable:
+    def _pull_field(
+        js: dict[str, Any], spec: list | str, extract_record: bool = False
+    ) -> Scalar | Iterable:
         """Internal function to pull field"""
         result = js
-        if isinstance(spec, list):
-            for field in spec:
-                result = result[field]
-        else:
-            result = result[spec]
+        try:
+            if isinstance(spec, list):
+                for field in spec:
+                    result = result[field]
+            else:
+                result = result[spec]
+        except KeyError as e:
+            if extract_record:
+                raise KeyError(
+                    f"Key {e} not found. If specifying a record_path, all elements of "
+                    f"data should have the path."
+                ) from e
+            elif errors == "ignore":
+                return np.nan
+            else:
+                raise KeyError(
+                    f"Key {e} not found. To replace missing values of {e} with "
+                    f"np.nan, pass in errors='ignore'"
+                ) from e
+
         return result
 
     def _pull_records(js: dict[str, Any], spec: list | str) -> list:
@@ -396,7 +413,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
         _pull_field, but require to return list. And will raise error
         if has non iterable value.
         """
-        result = _pull_field(js, spec)
+        result = _pull_field(js, spec, extract_record=True)
 
         # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
         # null, otherwise return an empty list
@@ -488,16 +505,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
                     if level + 1 > len(val):
                         meta_val = seen_meta[key]
                     else:
-                        try:
-                            meta_val = _pull_field(obj, val[level:])
-                        except KeyError as e:
-                            if errors == "ignore":
-                                meta_val = np.nan
-                            else:
-                                raise KeyError(
-                                    "Try running with errors='ignore' as key "
-                                    f"{e} is not always present"
-                                ) from e
+                        meta_val = _pull_field(obj, val[level:])
                     meta_vals[key].append(meta_val)
                 records.extend(recs)
 
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index a428d8c71a793..faf9fc903d7b5 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -105,6 +105,7 @@ def missing_metadata():
                     "zip": 44646,
                 }
             ],
+            "previous_residences": {"cities": [{"city_name": "Foo York City"}]},
         },
         {
             "addresses": [
@@ -115,7 +116,8 @@ def missing_metadata():
                     "state": "TN",
                     "zip": 37643,
                 }
-            ]
+            ],
+            "previous_residences": {"cities": [{"city_name": "Barmingham"}]},
         },
     ]
 
@@ -598,7 +600,10 @@ def test_json_normalize_errors(self, missing_metadata):
         # If meta keys are not always present a new option to set
         # errors='ignore' has been implemented
 
-        msg = "Try running with errors='ignore' as key 'name' is not always present"
+        msg = (
+            "Key 'name' not found. To replace missing values of "
+            "'name' with np.nan, pass in errors='ignore'"
+        )
         with pytest.raises(KeyError, match=msg):
             json_normalize(
                 data=missing_metadata,
@@ -618,11 +623,44 @@ def test_missing_meta(self, missing_metadata):
             [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
             [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
         ]
-        columns = ["city", "number", "state", "street", "zip", "name"]
         columns = ["number", "street", "city", "state", "zip", "name"]
         expected = DataFrame(ex_data, columns=columns)
         tm.assert_frame_equal(result, expected)
 
+    def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
+        # GH41876
+        # Ensure errors='raise' works as intended even when a record_path of length
+        # greater than one is passed in
+        msg = (
+            "Key 'name' not found. To replace missing values of "
+            "'name' with np.nan, pass in errors='ignore'"
+        )
+        with pytest.raises(KeyError, match=msg):
+            json_normalize(
+                data=missing_metadata,
+                record_path=["previous_residences", "cities"],
+                meta="name",
+                errors="raise",
+            )
+
+    def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
+        # GH41876
+        # Ensure errors='ignore' works as intended even when a record_path of length
+        # greater than one is passed in
+        result = json_normalize(
+            data=missing_metadata,
+            record_path=["previous_residences", "cities"],
+            meta="name",
+            errors="ignore",
+        )
+        ex_data = [
+            ["Foo York City", "Alice"],
+            ["Barmingham", np.nan],
+        ]
+        columns = ["city_name", "name"]
+        expected = DataFrame(ex_data, columns=columns)
+        tm.assert_frame_equal(result, expected)
+
     def test_donot_drop_nonevalues(self):
         # GH21356
         data = [