-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-3829: [Python] add __arrow_array__ protocol to support third-party array classes in conversion to Arrow #5106
Changes from 1 commit
c82eb88
198d699
e2b10c4
4861154
8ac304a
8e70995
bab01f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -161,7 +161,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, | |
else: | ||
c_from_pandas = from_pandas | ||
|
||
if _is_array_like(obj): | ||
if hasattr(obj, '__arrow_array__'): | ||
return obj.__arrow_array__(type=type) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we raise if non-default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that sounds as a good idea. |
||
elif _is_array_like(obj): | ||
if mask is not None: | ||
# out argument unused | ||
mask = get_series_values(mask, &is_pandas_object) | ||
|
@@ -178,7 +180,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, | |
mask = values.mask | ||
values = values.data | ||
|
||
if pandas_api.is_categorical(values): | ||
if hasattr(values, '__arrow_array__'): | ||
return values.__arrow_array__(type=type) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here: raise if |
||
elif pandas_api.is_categorical(values): | ||
return DictionaryArray.from_arrays( | ||
values.codes, values.categories.values, | ||
mask=mask, ordered=values.ordered, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2882,6 +2882,50 @@ def test_variable_dictionary_with_pandas(): | |
a.to_pandas() | ||
|
||
|
||
# ---------------------------------------------------------------------- | ||
# Array protocol in pandas conversions tests | ||
|
||
|
||
def test_array_protocol(): | ||
|
||
def __arrow_array__(self, type=None): | ||
return pa.array(self._data, mask=self._mask, type=type) | ||
|
||
df = pd.DataFrame({'a': pd.Series([1, 2, None], dtype='Int64')}) | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# with latest pandas/arrow, trying to convert nullable integer errors | ||
with pytest.raises(TypeError): | ||
pa.table(df) | ||
|
||
# patch IntegerArray with the protocol | ||
pd.arrays.IntegerArray.__arrow_array__ = __arrow_array__ | ||
|
||
# default conversion | ||
result = pa.table(df) | ||
expected = pa.array([1, 2, None], pa.int64()) | ||
assert result[0].chunk(0).equals(expected) | ||
|
||
# with specifying schema | ||
schema = pa.schema([('a', pa.float64())]) | ||
result = pa.table(df, schema=schema) | ||
expected2 = pa.array([1, 2, None], pa.float64()) | ||
assert result[0].chunk(0).equals(expected2) | ||
|
||
# pass Series to pa.array | ||
result = pa.array(df['a']) | ||
assert result.equals(expected) | ||
result = pa.array(df['a'], type=pa.float64()) | ||
assert result.equals(expected2) | ||
|
||
# pass actual ExtensionArray to pa.array | ||
result = pa.array(df['a'].values) | ||
assert result.equals(expected) | ||
result = pa.array(df['a'].values, type=pa.float64()) | ||
assert result.equals(expected2) | ||
|
||
del pd.arrays.IntegerArray.__arrow_array__ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe this could go in a try-finally to avoid a failure here possibly contaminating other tests. |
||
|
||
|
||
# ---------------------------------------------------------------------- | ||
# Legacy metadata compatibility tests | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to check for a return value of
NotImplemented
and fall back to the default path?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think so. Why would you define
__arrow_array__
just to returnNotImplemented
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I think if you don't want to implement this, the user should simply not define
__arrow_array__
.But we should probably add a check to ensure that was is returned from
__arrow_array__
is actually a pyarrow Array (that is something that numpy also does).