Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Allow other na values in StringArray Constructor #45168

Merged
merged 18 commits into from
Jan 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Series,
)
from pandas.arrays import StringArray

from .pandas_vb_common import tm

Expand Down Expand Up @@ -285,3 +287,18 @@ class Iter(Dtypes):
def time_iter(self, dtype):
for i in self.s:
pass


class StringArrayConstruction:
def setup(self):
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])

def time_string_array_construction(self):
StringArray(self.series_arr)

def time_string_array_with_nan_construction(self):
StringArray(self.series_arr_nan)

def peakmem_stringarray_construction(self):
StringArray(self.series_arr)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
-

Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ def astype_intsafe(
arr: npt.NDArray[np.object_],
new_dtype: np.dtype,
) -> np.ndarray: ...
def convert_nans_to_NA(
arr: npt.NDArray[np.object_],
) -> npt.NDArray[np.object_]: ...
def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...

# TODO: can we be more specific about rows?
Expand Down
38 changes: 34 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,40 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:

return result

ctypedef fused ndarr_object:
ndarray[object, ndim=1]
ndarray[object, ndim=2]

# TODO: get rid of this in StringArray and modify
jreback marked this conversation as resolved.
Show resolved Hide resolved
# and go through ensure_string_array instead
@cython.wraparound(False)
@cython.boundscheck(False)
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
"""
Helper for StringArray that converts null values that
are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements
have already been validated as null.
"""
cdef:
Py_ssize_t i, m, n
object val
ndarr_object result
result = np.asarray(arr, dtype="object")
jreback marked this conversation as resolved.
Show resolved Hide resolved
if arr.ndim == 2:
m, n = arr.shape[0], arr.shape[1]
for i in range(m):
for j in range(n):
val = arr[i, j]
if not isinstance(val, str):
result[i, j] = <object>C_NA
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
else:
n = len(arr)
for i in range(n):
val = arr[i]
if not isinstance(val, str):
result[i] = <object>C_NA
return result


@cython.wraparound(False)
@cython.boundscheck(False)
Expand Down Expand Up @@ -1880,10 +1914,6 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA


cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
Expand Down
19 changes: 16 additions & 3 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
where the elements are Python strings
or nan-likes (``None``, ``np.nan``, ``NA``).
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

.. versionchanged:: 1.5.0

StringArray now accepts array-likes containing
nan-likes(``None``, ``np.nan``) for the ``values`` parameter
in addition to strings and :attr:`pandas.NA`

copy : bool, default False
Whether to copy the array of data.

Expand Down Expand Up @@ -310,11 +317,11 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
"""Validate that we only store NA or strings."""
Expand All @@ -325,6 +332,12 @@ def _validate(self):
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
# Check to see if need to convert Na values to pd.NA
if self._ndarray.ndim > 2:
# Ravel if ndims > 2 b/c no cythonized version available
lib.convert_nans_to_NA(self._ndarray.ravel("K"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isnt the ravel unnecessary bc convert_nans_to_NA supports 2D?

Copy link
Member Author

@lithomas1 lithomas1 Jan 12, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for > 2D (e.g. 3d stuff). IIUC, pandas doesn't support > 2D but the fancy indexing tests that check for an error in the 3D case, expect that a 3D StringArray creates correctly for some reason only to raise later in the series constructor.

Previously failing tests here: https://github.com/pandas-dev/pandas/runs/4697417795?check_suite_focus=true

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just always ravel and then reshape?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for all ndim

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just always ravel and then reshape?

risks making copies

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk, don't luv the ndim > 3 here at all (i get your reasoning) but as this will be refactored would be good to simplify.

else:
lib.convert_nans_to_NA(self._ndarray)

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,15 +267,20 @@ def test_constructor_raises(cls):
cls(np.array([]))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", np.nan], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", None], dtype=object))
cls(np.array(["a", np.datetime64("nat")], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", pd.NaT], dtype=object))


@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
def test_constructor_nan_like(na):
expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
tm.assert_extension_array_equal(
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
)


@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy, cls, request):
if cls is ArrowStringArray and copy is False:
Expand Down
20 changes: 16 additions & 4 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1534,19 +1534,31 @@ def test_is_numeric_array(self):
assert not lib.is_integer_array(np.array([1, 2.0]))

def test_is_string_array(self):

# We should only be accepting pd.NA, np.nan,
# other floating point nans e.g. float('nan')]
# when skipna is True.
assert lib.is_string_array(np.array(["foo", "bar"]))
assert not lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
)
assert lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
)
# NaN is not valid for string array, just NA
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
assert not lib.is_string_array(
assert lib.is_string_array(
np.array(["foo", "bar", None], dtype=object), skipna=True
)
assert lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
)

assert not lib.is_string_array(
np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
)
assert not lib.is_string_array(
np.array(["foo", "bar", None], dtype=object), skipna=False
)
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
)
assert not lib.is_string_array(np.array([1, 2]))

def test_to_object_array_tuples(self):
Expand Down