ARROW-3829: [Python] add __arrow_array__ protocol to support third-party array classes in conversion to Arrow #5106

Closed
46 changes: 46 additions & 0 deletions docs/source/python/extending_types.rst
@@ -0,0 +1,46 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

.. currentmodule:: pyarrow
.. _extending_types:

Extending pyarrow
=================

Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol
-----------------------------------------------------------------------------

The :func:`pyarrow.array` function has built-in support for Python sequences,
numpy arrays and pandas 1D objects (Series, Index, Categorical, ...), and
converts those to Arrow arrays. This support can be extended to other
array-like objects by implementing the ``__arrow_array__`` method (similar to
numpy's ``__array__`` protocol).

For example, to support conversion of your duck array class to an Arrow array,
define the ``__arrow_array__`` method to return an Arrow array::

class MyDuckArray:

...

def __arrow_array__(self, type=None):
# convert the underlying array values to a pyarrow Array
import pyarrow
return pyarrow.array(..., type=type)

The ``__arrow_array__`` method takes an optional ``type`` keyword which is passed
through from :func:`pyarrow.array`.
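
A more concrete sketch, assuming the duck array stores its values in a numpy
array (the class and attribute names below are only illustrative, not part of
pyarrow)::

    import numpy as np
    import pyarrow

    class NumpyBackedArray:
        """Minimal duck array wrapping a 1D numpy array."""

        def __init__(self, data):
            self.data = np.asarray(data)

        def __arrow_array__(self, type=None):
            # delegate the conversion (and optional cast) to pyarrow.array
            return pyarrow.array(self.data, type=type)

    arr = pyarrow.array(NumpyBackedArray([1, 2, 3]))
    # arr is a pyarrow.Array of type int64; passing type=pyarrow.float64()
    # to pyarrow.array would instead be forwarded to __arrow_array__
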
1 change: 1 addition & 0 deletions docs/source/python/index.rst
@@ -46,6 +46,7 @@ files into Arrow structures.
json
parquet
cuda
extending_types
extending
api
getting_involved
20 changes: 18 additions & 2 deletions python/pyarrow/array.pxi
@@ -84,6 +84,18 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
return pyarrow_wrap_array(chunked_out.get().chunk(0))


def _handle_arrow_array_protocol(obj, type, mask, size):
if mask is not None or size is not None:
raise ValueError(
"Cannot specify a mask or a size when passing an object that is "
"converted with the __arrow_array__ protocol.")
res = obj.__arrow_array__(type=type)
if not isinstance(res, Array):
raise ValueError("The object's __arrow_array__ method does not "
                         "return a pyarrow Array.")
    return res

[Review comment, Member]: TypeError, no?

[Reply, Member (Author)]: NumPy raises a ValueError in this case, but yes, a TypeError seems more appropriate.


def array(object obj, type=None, mask=None, size=None, from_pandas=None,
bint safe=True, MemoryPool memory_pool=None):
"""
@@ -161,7 +173,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
else:
c_from_pandas = from_pandas

if _is_array_like(obj):
if hasattr(obj, '__arrow_array__'):
return _handle_arrow_array_protocol(obj, type, mask, size)
elif _is_array_like(obj):
if mask is not None:
# out argument unused
mask = get_series_values(mask, &is_pandas_object)
@@ -178,7 +192,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
mask = values.mask
values = values.data

if pandas_api.is_categorical(values):
if hasattr(values, '__arrow_array__'):
return _handle_arrow_array_protocol(values, type, mask, size)
elif pandas_api.is_categorical(values):
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
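
To illustrate how the new dispatch behaves (a minimal sketch; the class name is
hypothetical and not part of this change):

    import numpy as np
    import pyarrow as pa

    class ProtocolArray:
        def __init__(self, data):
            self.data = np.asarray(data)

        def __arrow_array__(self, type=None):
            return pa.array(self.data, type=type)

    pa.array(ProtocolArray([1, 2, 3]))                     # handled via __arrow_array__
    pa.array(ProtocolArray([1, 2, 3]), type=pa.float64())  # type is passed through
    pa.array(ProtocolArray([1, 2, 3]), size=3)             # raises ValueError (size/mask not allowed)
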
21 changes: 14 additions & 7 deletions python/pyarrow/pandas-shim.pxi
@@ -30,7 +30,7 @@ cdef class _PandasAPIShim(object):
object _loose_version, _version
object _pd, _types_api, _compat_module
object _data_frame, _index, _series, _categorical_type
object _datetimetz_type
object _datetimetz_type, _extension_array
object _array_like_types

def __init__(self):
@@ -49,18 +49,25 @@ cdef class _PandasAPIShim(object):
return

self._pd = pd
self._version = pd.__version__
from distutils.version import LooseVersion
self._loose_version = LooseVersion(pd.__version__)

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
self._index = pd.Index
self._categorical_type = pd.Categorical
self._series = pd.Series
if self._loose_version >= LooseVersion('0.23.0'):
self._extension_array = pd.api.extensions.ExtensionArray
self._array_like_types = (
self._series, self._index, self._categorical_type,
self._extension_array)
else:
self._extension_array = None
self._array_like_types = (
self._series, self._index, self._categorical_type)

self._array_like_types = (self._series, self._index,
self._categorical_type)

self._version = pd.__version__
from distutils.version import LooseVersion
self._loose_version = LooseVersion(pd.__version__)
if self._loose_version >= LooseVersion('0.20.0'):
from pandas.api.types import DatetimeTZDtype
self._types_api = pd.api.types
39 changes: 39 additions & 0 deletions python/pyarrow/tests/test_array.py
@@ -1565,6 +1565,45 @@ def test_array_from_large_pyints():
pa.array([int(2 ** 63)])


def test_array_protocol():

class MyArray:
def __init__(self, data):
self.data = data

def __arrow_array__(self, type=None):
return pa.array(self.data, type=type)

arr = MyArray(np.array([1, 2, 3], dtype='int64'))
result = pa.array(arr)
expected = pa.array([1, 2, 3], type=pa.int64())
assert result.equals(expected)
result = pa.array(arr, type=pa.int64())
expected = pa.array([1, 2, 3], type=pa.int64())
assert result.equals(expected)
result = pa.array(arr, type=pa.float64())
expected = pa.array([1, 2, 3], type=pa.float64())
assert result.equals(expected)

# raise error when passing size or mask keywords
with pytest.raises(ValueError):
pa.array(arr, mask=np.array([True, False, True]))
with pytest.raises(ValueError):
pa.array(arr, size=3)

# ensure the return value is an Array
class MyArrayInvalid:
def __init__(self, data):
self.data = data

def __arrow_array__(self, type=None):
return np.array(self.data)

arr = MyArrayInvalid(np.array([1, 2, 3], dtype='int64'))
with pytest.raises(ValueError):
pa.array(arr)


def test_concat_array():
concatenated = pa.concat_arrays(
[pa.array([1, 2]), pa.array([3, 4])])
48 changes: 48 additions & 0 deletions python/pyarrow/tests/test_pandas.py
@@ -2882,6 +2882,54 @@ def test_variable_dictionary_with_pandas():
a.to_pandas()


# ----------------------------------------------------------------------
# Array protocol in pandas conversions tests


@pytest.mark.skipif(LooseVersion(pd.__version__) < '0.24.0',
reason='IntegerArray only introduced in 0.24')
def test_array_protocol():

def __arrow_array__(self, type=None):
return pa.array(self._data, mask=self._mask, type=type)

df = pd.DataFrame({'a': pd.Series([1, 2, None], dtype='Int64')})

# with latest pandas/arrow, trying to convert nullable integer errors
with pytest.raises(TypeError):
pa.table(df)

try:
# patch IntegerArray with the protocol
pd.arrays.IntegerArray.__arrow_array__ = __arrow_array__

# default conversion
result = pa.table(df)
expected = pa.array([1, 2, None], pa.int64())
assert result[0].chunk(0).equals(expected)

# with specifying schema
schema = pa.schema([('a', pa.float64())])
result = pa.table(df, schema=schema)
expected2 = pa.array([1, 2, None], pa.float64())
assert result[0].chunk(0).equals(expected2)

# pass Series to pa.array
result = pa.array(df['a'])
assert result.equals(expected)
result = pa.array(df['a'], type=pa.float64())
assert result.equals(expected2)

# pass actual ExtensionArray to pa.array
result = pa.array(df['a'].values)
assert result.equals(expected)
result = pa.array(df['a'].values, type=pa.float64())
assert result.equals(expected2)

finally:
del pd.arrays.IntegerArray.__arrow_array__


# ----------------------------------------------------------------------
# Legacy metadata compatibility tests
