From c2bb2aafa3fd1f3320ad76737c7d731b2f02bab9 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Thu, 29 Aug 2024 10:26:19 -0700 Subject: [PATCH 01/15] expose to_pandas_kwargs in pyarrow engine --- pandas/io/parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 24415299e799b..2b592736fcb9f 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -245,11 +245,12 @@ def read( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs=None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} + to_pandas_kwargs = to_pandas_kwargs if to_pandas_kwargs is not None else {} if dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping From 2a36913a60ea4eb399bcc121e0d612d0f4a4e4f5 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Thu, 29 Aug 2024 15:39:39 -0700 Subject: [PATCH 02/15] add test for roundtripping maps --- pandas/tests/io/test_parquet.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a29e479b7c9f1..988f8a245358d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1173,6 +1173,19 @@ def test_non_nanosecond_timestamps(self, temp_file): ) tm.assert_frame_equal(result, expected) + def test_maps_as_pydicts(self, pa): + import pyarrow + + schema = pyarrow.schema( + [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] + ) + df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}]) + check_round_trip( + df, + pa, + write_kwargs={"schema": schema}, + ) + class TestParquetFastParquet(Base): @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") From 0d6cd16a7d96b438fb3780d6e056db535dd3878e Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Thu, 29 Aug 2024 15:42:51 -0700 Subject: [PATCH 03/15] make test pass by using maps_as_pydicts --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 988f8a245358d..fc73d51799809 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1184,6 +1184,7 @@ def test_maps_as_pydicts(self, pa): df, pa, write_kwargs={"schema": schema}, + read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}}, ) From 908614d0eabe5f22cb2442b8bc13d4d7eccc796b Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Thu, 29 Aug 2024 16:54:31 -0700 Subject: [PATCH 04/15] remove unused type ignore --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2b592736fcb9f..66a958a835fdd 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -257,7 +257,7 @@ def read( mapping = _arrow_dtype_mapping() to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype elif using_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() From 29675cdf4fd80322ed2285563c56d2b19fa613c0 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Thu, 29 Aug 2024 17:41:01 -0700 Subject: [PATCH 05/15] skip test if pyarrow is too old --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fc73d51799809..cb88e8a08e539 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1174,7 +1174,7 @@ def test_non_nanosecond_timestamps(self, temp_file): tm.assert_frame_equal(result, expected) def test_maps_as_pydicts(self, pa): - import pyarrow + pyarrow = pytest.importorskip("pyarrow", "13.0.0") schema = pyarrow.schema( [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))] From 6d7accf975145c3fd33dfb121a86fb4fa6625a73 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Fri, 30 Aug 2024 16:08:04 -0700 Subject: [PATCH 06/15] update whatsnew --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 528226502da33..3f5527c5dcc8b 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -127,7 +127,7 @@ MultiIndex I/O ^^^ -- +- ``pyarrow`` engine for :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas`. This enables passing in ``maps_as_pydicts`` to read parquet map datatypes as python dictionaries. (:issue:`56842`) - Period From 61dd637a56a7164a50378be9f9763509ec6970a8 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 18 Sep 2024 15:19:34 -0700 Subject: [PATCH 07/15] Apply suggestions from code review Co-authored-by: Xiao Yuan --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 66a958a835fdd..b4c45cb674b87 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -245,12 +245,12 @@ def read( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, storage_options: StorageOptions | None = None, filesystem=None, - to_pandas_kwargs=None, + to_pandas_kwargs: dict[str, Any] | None = None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = to_pandas_kwargs if to_pandas_kwargs is not None else {} + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs if dtype_backend == "numpy_nullable": from pandas.io._util import _arrow_dtype_mapping From 67ecfb5f02c629c25034a465783c63c56ef2e237 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Mon, 18 Nov 2024 15:43:43 -0800 Subject: [PATCH 08/15] move to v3.0.0 whatsnew --- doc/source/whatsnew/v2.3.0.rst | 2 +- doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3f5527c5dcc8b..528226502da33 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -127,7 +127,7 @@ MultiIndex I/O ^^^ -- ``pyarrow`` engine for :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas`. This enables passing in ``maps_as_pydicts`` to read parquet map datatypes as python dictionaries. (:issue:`56842`) +- - Period diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da0d85b7bb529..74aadf6ece517 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -628,6 +628,7 @@ I/O - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) +- ``pyarrow`` engine for :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas`. This enables passing in ``maps_as_pydicts`` to read parquet map datatypes as python dictionaries. (:issue:`56842`) Period ^^^^^^ From 937b29f42f0ebe828fb326d689f8ee90db01ad47 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Mon, 18 Nov 2024 15:51:26 -0800 Subject: [PATCH 09/15] add to docstring --- pandas/io/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b4c45cb674b87..03f756e13e3f0 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -467,7 +467,9 @@ def to_parquet( .. versionadded:: 2.1.0 kwargs - Additional keyword arguments passed to the engine + Additional keyword arguments passed to the engine. + When using the ``'pyarrow'`` engine ``to_pandas_kwargs`` can be used to pass + through arguments to ``pyarrow.Table.to_pandas``. Returns ------- From 15ed566fdd0a2d4816e2d20fba2b922123e5baf5 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Tue, 19 Nov 2024 08:59:49 -0800 Subject: [PATCH 10/15] try to make mypy happy --- pandas/io/_util.py | 6 ++++-- pandas/io/parquet.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 748205c088acf..9778a404e23e0 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -60,10 +60,12 @@ def arrow_table_to_pandas( table: pyarrow.Table, dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, null_to_int64: bool = False, - **kwargs, + to_pandas_kwargs: dict | None = None, ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") + to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs + types_mapper: type[pd.ArrowDtype] | None | Callable if dtype_backend == "numpy_nullable": mapping = _arrow_dtype_mapping() @@ -81,5 +83,5 @@ def arrow_table_to_pandas( else: raise NotImplementedError - df = table.to_pandas(types_mapper=types_mapper, **kwargs) + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) return df diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b4c043be36946..62eaf4b859aea 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -247,7 +247,6 @@ def read( ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -269,7 +268,9 @@ def read( DeprecationWarning, ) result = arrow_table_to_pandas( - pa_table, dtype_backend=dtype_backend, **to_pandas_kwargs + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, ) if pa_table.schema.metadata: From a13f0545a4504d166b9c63a2d4ed55a89739bd85 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Tue, 19 Nov 2024 17:38:56 -0800 Subject: [PATCH 11/15] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c00e219d7d0d8..2d1467dda1548 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,7 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) -- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing in ``maps_as_pydicts`` to read parquet map datatypes as python dictionaries (:issue:`56842`) +- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) From a092ede9d577be936d9140d5dcb831d666fc645b Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Tue, 19 Nov 2024 17:47:11 -0800 Subject: [PATCH 12/15] make to_pandas_kwargs explicit parameter and update docstring --- pandas/io/parquet.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 62eaf4b859aea..03df750a874c4 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -458,8 +458,6 @@ def to_parquet( kwargs Additional keyword arguments passed to the engine. - When using the ``'pyarrow'`` engine ``to_pandas_kwargs`` can be used to pass - through arguments to ``pyarrow.Table.to_pandas``. Returns ------- @@ -498,6 +496,7 @@ def read_parquet( dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, filesystem: Any = None, filters: list[tuple] | list[list[tuple]] | None = None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: """ @@ -571,6 +570,12 @@ def read_parquet( .. versionadded:: 2.1.0 + to_pandas_kwargs: dict | None, default None + keyword arguments to pass through to ``pyarrow.Table.to_pandas`` + when ``engine="pyarrow"``. + + .. versionadded:: 3.0.0 + **kwargs Any additional kwargs are passed to the engine. @@ -643,5 +648,6 @@ def read_parquet( storage_options=storage_options, dtype_backend=dtype_backend, filesystem=filesystem, + to_pandas_kwargs=to_pandas_kwargs, **kwargs, ) From 1c0cd02c24ebac6ac150aacc9b706651d7c5e25f Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Tue, 19 Nov 2024 21:06:07 -0800 Subject: [PATCH 13/15] fix FastParquetImpl --- pandas/io/parquet.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 03df750a874c4..cc3e9be283fa8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -352,6 +352,7 @@ def read( filters=None, storage_options: StorageOptions | None = None, filesystem=None, + to_pandas_kwargs: dict | None = None, **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} @@ -367,6 +368,10 @@ def read( raise NotImplementedError( "filesystem is not implemented for the fastparquet engine." ) + if to_pandas_kwargs is not None: + raise NotImplementedError( + "to_pandas_kwargs is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): From dac237ac3de5679f0dea901627efca85d5502f89 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 20 Nov 2024 08:03:20 -0800 Subject: [PATCH 14/15] try to fix docstring --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cc3e9be283fa8..eafa3d990e532 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -575,8 +575,8 @@ def read_parquet( .. versionadded:: 2.1.0 - to_pandas_kwargs: dict | None, default None - keyword arguments to pass through to ``pyarrow.Table.to_pandas`` + to_pandas_kwargs : dict | None, default None + Keyword arguments to pass through to ``pyarrow.Table.to_pandas`` when ``engine="pyarrow"``. .. versionadded:: 3.0.0 From d4f55a1da2896e339344e2e3efbc74c63628e3d2 Mon Sep 17 00:00:00 2001 From: Joseph Kleinhenz Date: Wed, 20 Nov 2024 15:54:55 -0800 Subject: [PATCH 15/15] Update pandas/io/parquet.py Co-authored-by: Joris Van den Bossche --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index eafa3d990e532..6a5a83088e986 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -576,7 +576,7 @@ def read_parquet( .. versionadded:: 2.1.0 to_pandas_kwargs : dict | None, default None - Keyword arguments to pass through to ``pyarrow.Table.to_pandas`` + Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas` when ``engine="pyarrow"``. .. versionadded:: 3.0.0