diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 1b8e0fdb856b5..6d09e10f284af 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -19,19 +19,20 @@ objects contained with a :class:`Index`, :class:`Series`, or For some data types, pandas extends NumPy's type system. String aliases for these types can be found at :ref:`basics.dtypes`. -=================== ========================= ================== ============================= -Kind of Data pandas Data Type Scalar Array -=================== ========================= ================== ============================= -TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime` -Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta` -Period (time spans) :class:`PeriodDtype` :class:`Period` :ref:`api.arrays.period` -Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.arrays.interval` -Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na` -Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` -Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` -Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` -Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` -=================== ========================= ================== ============================= +=================== ========================= ============================= ============================= +Kind of Data pandas Data Type Scalar Array +=================== ========================= ============================= ============================= +TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime` +Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta` +Period (time spans) :class:`PeriodDtype` :class:`Period` :ref:`api.arrays.period` +Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.arrays.interval` 
+Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na` +Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` +Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` +Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` +PyArrow :class:`ArrowDtype` Python Scalars or :class:`NA` :ref:`api.arrays.arrow` +=================== ========================= ============================= ============================= pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). The top-level :meth:`array` method can be used to create a new array, which may be @@ -42,6 +43,44 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra array +.. _api.arrays.arrow: + +PyArrow +------- + +.. warning:: + + This feature is experimental, and the API can change in a future release without warning. + +The :class:`arrays.ArrowExtensionArray` is backed by a :external+pyarrow:py:class:`pyarrow.ChunkedArray` with a +:external+pyarrow:py:class:`pyarrow.DataType` instead of a NumPy array and data type. The ``.dtype`` of an :class:`arrays.ArrowExtensionArray` +is an :class:`ArrowDtype`. + +`PyArrow <https://arrow.apache.org/docs/python/index.html>`__ provides similar array and `data type <https://arrow.apache.org/docs/python/api/datatypes.html>`__ +support as NumPy including first-class nullability support for all data types, immutability and more. + +.. note:: + + For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated + by :class:`arrays.ArrowStringArray` and ``StringDtype("pyarrow")``. See the :ref:`string section <api.arrays.string>` + below. + +While individual values in an :class:`arrays.ArrowExtensionArray` are stored as PyArrow objects, scalars are **returned** +as Python scalars corresponding to the data type, e.g. a PyArrow int64 will be returned as a Python int, or :class:`NA` for missing +values. + +.. 
autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + arrays.ArrowExtensionArray + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + ArrowDtype + .. _api.arrays.datetime: Datetimes diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index bcbe2c6d8b104..f0eb73edda332 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -24,6 +24,40 @@ https://github.com/pandas-dev/pandas-stubs for more information. We thank VirtusLab and Microsoft for their initial, significant contributions to ``pandas-stubs`` +.. _whatsnew_150.enhancements.arrow: + +Native PyArrow-backed ExtensionArray +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +With `PyArrow <https://arrow.apache.org/docs/python/index.html>`__ installed, users can now create pandas objects +that are backed by a ``pyarrow.ChunkedArray`` and ``pyarrow.DataType``. + +The ``dtype`` argument can accept a string of a `pyarrow data type <https://arrow.apache.org/docs/python/api/datatypes.html>`__ +with ``pyarrow`` in brackets, e.g. ``"int64[pyarrow]"`` or, for pyarrow data types that take parameters, an :class:`ArrowDtype` +initialized with a ``pyarrow.DataType``. + +.. ipython:: python + + import pyarrow as pa + ser_float = pd.Series([1.0, 2.0, None], dtype="float32[pyarrow]") + ser_float + + list_of_int_type = pd.ArrowDtype(pa.list_(pa.int64())) + ser_list = pd.Series([[1, 2], [3, None]], dtype=list_of_int_type) + ser_list + + ser_list.take([1, 0]) + ser_float * 5 + ser_float.mean() + ser_float.dropna() + +Most operations are supported and have been implemented using `pyarrow compute <https://arrow.apache.org/docs/python/api/compute.html>`__ functions. +We recommend installing the latest version of PyArrow to access the most recently implemented compute functions. + +.. warning:: + + This feature is experimental, and the API can change in a future release without warning. + .. 
_whatsnew_150.enhancements.dataframe_interchange: DataFrame interchange protocol implementation diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 89d362eb77e68..3a8e80a6b5d2b 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. """ from pandas.core.arrays import ( + ArrowExtensionArray, ArrowStringArray, BooleanArray, Categorical, @@ -19,6 +20,7 @@ ) __all__ = [ + "ArrowExtensionArray", "ArrowStringArray", "BooleanArray", "Categorical", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8d80cb18d21db..1f7939011a1f1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -159,8 +159,47 @@ def to_pyarrow_type( class ArrowExtensionArray(OpsMixin, ExtensionArray): """ - Base class for ExtensionArray backed by Arrow ChunkedArray. - """ + Pandas ExtensionArray backed by a PyArrow ChunkedArray. + + .. warning:: + + ArrowExtensionArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : pyarrow.Array or pyarrow.ChunkedArray + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + ArrowExtensionArray + + Notes + ----- + Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__ + Some methods may either raise an exception or raise a ``PerformanceWarning`` if an + associated compute function is not available based on the installed version of PyArrow. + + Please install the latest version of PyArrow to enable the best functionality and avoid + potential bugs in prior versions of PyArrow. 
+ + Examples + -------- + Create an ArrowExtensionArray with :func:`pandas.array`: + + >>> pd.array([1, 1, None], dtype="int64[pyarrow]") + <ArrowExtensionArray> + [1, 1, <NA>] + Length: 3, dtype: int64[pyarrow] + """ # noqa: E501 (http link too long) _data: pa.ChunkedArray _dtype: ArrowDtype diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 523e031c220e4..48e2c5bdda2f8 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -20,9 +20,47 @@ @register_extension_dtype class ArrowDtype(StorageExtensionDtype): """ - Base class for dtypes for ArrowExtensionArray. - Modeled after BaseMaskedDtype - """ + An ExtensionDtype for PyArrow data types. + + .. warning:: + + ArrowDtype is considered experimental. The implementation and + parts of the API may change without warning. + + While most ``dtype`` arguments can accept the "string" + constructor, e.g. ``"int64[pyarrow]"``, ArrowDtype is useful + if the data type contains parameters like ``pyarrow.timestamp``. + + Parameters + ---------- + pyarrow_dtype : pa.DataType + An instance of a `pyarrow.DataType <https://arrow.apache.org/docs/python/api/datatypes.html>`__. + + Attributes + ---------- + pyarrow_dtype + + Methods + ------- + None + + Returns + ------- + ArrowDtype + + Examples + -------- + >>> import pyarrow as pa + >>> pd.ArrowDtype(pa.int64()) + int64[pyarrow] + + Types with parameters must be constructed with ArrowDtype. 
+ + >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York")) + timestamp[s, tz=America/New_York][pyarrow] + >>> pd.ArrowDtype(pa.list_(pa.int64())) + list<item: int64>[pyarrow] + """ # noqa: E501 _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment] @@ -37,6 +75,9 @@ def __init__(self, pyarrow_dtype: pa.DataType) -> None: ) self.pyarrow_dtype = pyarrow_dtype + def __repr__(self) -> str: + return self.name + @property def type(self): """ diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 9aca47dbddbf2..e7233484e16b6 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -150,6 +150,7 @@ "LZMA", "Numba", "Timestamp", + "PyArrow", } CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}