-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN/ERR: str.cat internals #22725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLN/ERR: str.cat internals #22725
Changes from all commits
5f8890c
285a1f7
28e7859
807f18e
ed27c66
0d3c6d2
36c6240
a97fe67
e58ec9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,9 @@ | |
|
||
from pandas.compat import zip | ||
from pandas.core.dtypes.generic import ABCSeries, ABCIndex | ||
from pandas.core.dtypes.missing import isna, notna | ||
from pandas.core.dtypes.missing import isna | ||
from pandas.core.dtypes.common import ( | ||
ensure_object, | ||
is_bool_dtype, | ||
is_categorical_dtype, | ||
is_object_dtype, | ||
|
@@ -36,114 +37,26 @@ | |
_shared_docs = dict() | ||
|
||
|
||
def _get_array_list(arr, others): | ||
""" | ||
Auxiliary function for :func:`str_cat` | ||
|
||
Parameters | ||
---------- | ||
arr : ndarray | ||
The left-most ndarray of the concatenation | ||
others : list, ndarray, Series | ||
The rest of the content to concatenate. If list of list-likes, | ||
all elements must be passable to ``np.asarray``. | ||
|
||
Returns | ||
------- | ||
list | ||
List of all necessary arrays | ||
""" | ||
from pandas.core.series import Series | ||
|
||
if len(others) and isinstance(com.values_from_object(others)[0], | ||
(list, np.ndarray, Series)): | ||
arrays = [arr] + list(others) | ||
else: | ||
arrays = [arr, others] | ||
|
||
return [np.asarray(x, dtype=object) for x in arrays] | ||
|
||
|
||
def str_cat(arr, others=None, sep=None, na_rep=None): | ||
def cat_core(list_of_columns, sep): | ||
""" | ||
Auxiliary function for :meth:`str.cat` | ||
|
||
If `others` is specified, this function concatenates the Series/Index | ||
and elements of `others` element-wise. | ||
If `others` is not being passed then all values in the Series are | ||
concatenated in a single string with a given `sep`. | ||
|
||
Parameters | ||
---------- | ||
others : list-like, or list of list-likes, optional | ||
List-likes (or a list of them) of the same length as calling object. | ||
If None, returns str concatenating strings of the Series. | ||
sep : string or None, default None | ||
If None, concatenates without any separator. | ||
na_rep : string or None, default None | ||
If None, NA in the series are ignored. | ||
list_of_columns : list of numpy arrays | ||
List of arrays to be concatenated with sep; | ||
these arrays may not contain NaNs! | ||
sep : string | ||
The separator string for concatenating the columns | ||
|
||
Returns | ||
------- | ||
concat | ||
ndarray containing concatenated results (if `others is not None`) | ||
or str (if `others is None`) | ||
nd.array | ||
The concatenation of list_of_columns with sep | ||
""" | ||
if sep is None: | ||
sep = '' | ||
|
||
if others is not None: | ||
arrays = _get_array_list(arr, others) | ||
|
||
n = _length_check(arrays) | ||
masks = np.array([isna(x) for x in arrays]) | ||
cats = None | ||
|
||
if na_rep is None: | ||
na_mask = np.logical_or.reduce(masks, axis=0) | ||
|
||
result = np.empty(n, dtype=object) | ||
np.putmask(result, na_mask, np.nan) | ||
|
||
notmask = ~na_mask | ||
|
||
tuples = zip(*[x[notmask] for x in arrays]) | ||
cats = [sep.join(tup) for tup in tuples] | ||
|
||
result[notmask] = cats | ||
else: | ||
for i, x in enumerate(arrays): | ||
x = np.where(masks[i], na_rep, x) | ||
if cats is None: | ||
cats = x | ||
else: | ||
cats = cats + sep + x | ||
|
||
result = cats | ||
|
||
return result | ||
else: | ||
arr = np.asarray(arr, dtype=object) | ||
mask = isna(arr) | ||
if na_rep is None and mask.any(): | ||
if sep == '': | ||
na_rep = '' | ||
else: | ||
return sep.join(arr[notna(arr)]) | ||
return sep.join(np.where(mask, na_rep, arr)) | ||
|
||
|
||
def _length_check(others): | ||
n = None | ||
for x in others: | ||
try: | ||
if n is None: | ||
n = len(x) | ||
elif len(x) != n: | ||
raise ValueError('All arrays must be same length') | ||
except TypeError: | ||
raise ValueError('Must pass arrays containing strings to str_cat') | ||
return n | ||
list_with_sep = [sep] * (2 * len(list_of_columns) - 1) | ||
list_with_sep[::2] = list_of_columns | ||
return np.sum(list_with_sep, axis=0) | ||
|
||
|
||
def _na_map(f, arr, na_result=np.nan, dtype=object): | ||
|
@@ -2283,6 +2196,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
|
||
if isinstance(others, compat.string_types): | ||
raise ValueError("Did you mean to supply a `sep` keyword?") | ||
if sep is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not anything you need to change in this PR but if we just overwrite There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed - thought about that as well, but didn't know if that's considered an API change. |
||
sep = '' | ||
|
||
if isinstance(self._orig, Index): | ||
data = Series(self._orig, index=self._orig) | ||
|
@@ -2291,9 +2206,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
|
||
# concatenate Series/Index with itself if no "others" | ||
if others is None: | ||
result = str_cat(data, others=others, sep=sep, na_rep=na_rep) | ||
return self._wrap_result(result, | ||
use_codes=(not self._is_categorical)) | ||
data = ensure_object(data) | ||
na_mask = isna(data) | ||
if na_rep is None and na_mask.any(): | ||
data = data[~na_mask] | ||
elif na_rep is not None and na_mask.any(): | ||
data = np.where(na_mask, na_rep, data) | ||
return sep.join(data) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we do a single sep.join, and just have the branches mask the data as needed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
try: | ||
# turn anything in "others" into lists of Series | ||
|
@@ -2320,23 +2239,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
"'outer'|'inner'|'right'`. The future default will " | ||
"be `join='left'`.", FutureWarning, stacklevel=2) | ||
|
||
# if join is None, _get_series_list already force-aligned indexes | ||
join = 'left' if join is None else join | ||
|
||
# align if required | ||
if join is not None: | ||
if any(not data.index.equals(x.index) for x in others): | ||
# Need to add keys for uniqueness in case of duplicate columns | ||
others = concat(others, axis=1, | ||
join=(join if join == 'inner' else 'outer'), | ||
keys=range(len(others))) | ||
keys=range(len(others)), copy=False) | ||
data, others = data.align(others, join=join) | ||
others = [others[x] for x in others] # again list of Series | ||
|
||
# str_cat discards index | ||
res = str_cat(data, others=others, sep=sep, na_rep=na_rep) | ||
all_cols = [ensure_object(x) for x in [data] + others] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assuming the index is aligned here can we alternately just concat the columns together and call |
||
na_masks = np.array([isna(x) for x in all_cols]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar comment above but should be able to use |
||
union_mask = np.logical_or.reduce(na_masks, axis=0) | ||
|
||
if na_rep is None and union_mask.any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment on these cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comments |
||
# no na_rep means NaNs for all rows where any column has a NaN | ||
# only necessary if there are actually any NaNs | ||
result = np.empty(len(data), dtype=object) | ||
np.putmask(result, union_mask, np.nan) | ||
|
||
not_masked = ~union_mask | ||
result[not_masked] = cat_core([x[not_masked] for x in all_cols], | ||
sep) | ||
elif na_rep is not None and union_mask.any(): | ||
# fill NaNs with na_rep in case there are actually any NaNs | ||
all_cols = [np.where(nm, na_rep, col) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Readability nit but can you change |
||
for nm, col in zip(na_masks, all_cols)] | ||
result = cat_core(all_cols, sep) | ||
else: | ||
# no NaNs - can just concatenate | ||
result = cat_core(all_cols, sep) | ||
|
||
if isinstance(self._orig, Index): | ||
res = Index(res, name=self._orig.name) | ||
result = Index(result, name=self._orig.name) | ||
else: # Series | ||
res = Series(res, index=data.index, name=self._orig.name) | ||
return res | ||
result = Series(result, index=data.index, name=self._orig.name) | ||
return result | ||
|
||
_shared_docs['str_split'] = (""" | ||
Split strings around given separator/delimiter. | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self): | |||
assert i == 100 | ||||
assert s == 'h' | ||||
|
||||
def test_cat(self): | ||||
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) | ||||
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) | ||||
|
||||
# single array | ||||
result = strings.str_cat(one) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Please have a look here - this is directly importing the internal method and testing it (not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||||
exp = 'aabbc' | ||||
assert result == exp | ||||
|
||||
result = strings.str_cat(one, na_rep='NA') | ||||
exp = 'aabbcNA' | ||||
assert result == exp | ||||
|
||||
result = strings.str_cat(one, na_rep='-') | ||||
exp = 'aabbc-' | ||||
assert result == exp | ||||
|
||||
result = strings.str_cat(one, sep='_', na_rep='NA') | ||||
exp = 'a_a_b_b_c_NA' | ||||
assert result == exp | ||||
|
||||
result = strings.str_cat(two, sep='-') | ||||
exp = 'a-b-d-foo' | ||||
assert result == exp | ||||
|
||||
# Multiple arrays | ||||
result = strings.str_cat(one, [two], na_rep='NA') | ||||
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], | ||||
dtype=np.object_) | ||||
tm.assert_numpy_array_equal(result, exp) | ||||
|
||||
result = strings.str_cat(one, two) | ||||
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) | ||||
tm.assert_almost_equal(result, exp) | ||||
|
||||
# error for incorrect lengths | ||||
rgx = 'All arrays must be same length' | ||||
three = Series(['1', '2', '3']) | ||||
|
||||
with tm.assert_raises_regex(ValueError, rgx): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I can't mark lines that are not in the diff, but check out pandas/pandas/tests/test_strings.py Line 119 in fcd11b5
I replicated the removed test (acting on |
||||
strings.str_cat(one, three) | ||||
|
||||
# error for incorrect type | ||||
rgx = "Must pass arrays containing strings to str_cat" | ||||
with tm.assert_raises_regex(ValueError, rgx): | ||||
strings.str_cat(one, 'three') | ||||
|
||||
@pytest.mark.parametrize('box', [Series, Index]) | ||||
@pytest.mark.parametrize('other', [None, Series, Index]) | ||||
def test_str_cat_name(self, box, other): | ||||
|
@@ -414,6 +367,12 @@ def test_str_cat_align_mixed_inputs(self, join): | |||
with tm.assert_raises_regex(ValueError, rgx): | ||||
s.str.cat([t, z], join=join) | ||||
|
||||
def test_str_cat_raises(self): | ||||
# non-strings hiding behind object dtype | ||||
s = Series([1, 2, 3, 4], dtype='object') | ||||
with tm.assert_raises_regex(TypeError, "unsupported operand type.*"): | ||||
s.str.cat(s) | ||||
|
||||
def test_str_cat_special_cases(self): | ||||
s = Series(['a', 'b', 'c', 'd']) | ||||
t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is with this documented limitation? Believe in master that
NaN
is valid and returnsNaN
when passed inThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@WillAyd
the new helper function
cat_core
has nothing to do with the existing (essentially fully-fledged up to index-handling)str_cat
. It's an internal function to avoid lots of copied code, and (as it just wrapsnp.sum
) is not nan-safe for string values.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@WillAyd
Does that answer your question? what would you like to see for this docstring?