Skip to content

CLN/ERR: str.cat internals #22725

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 51 additions & 110 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@

from pandas.compat import zip
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.common import (
ensure_object,
is_bool_dtype,
is_categorical_dtype,
is_object_dtype,
Expand Down Expand Up @@ -36,114 +37,26 @@
_shared_docs = dict()


def _get_array_list(arr, others):
"""
Auxiliary function for :func:`str_cat`

Parameters
----------
arr : ndarray
The left-most ndarray of the concatenation
others : list, ndarray, Series
The rest of the content to concatenate. If list of list-likes,
all elements must be passable to ``np.asarray``.

Returns
-------
list
List of all necessary arrays
"""
from pandas.core.series import Series

if len(others) and isinstance(com.values_from_object(others)[0],
(list, np.ndarray, Series)):
arrays = [arr] + list(others)
else:
arrays = [arr, others]

return [np.asarray(x, dtype=object) for x in arrays]


def str_cat(arr, others=None, sep=None, na_rep=None):
def cat_core(list_of_columns, sep):
"""
Auxiliary function for :meth:`str.cat`

If `others` is specified, this function concatenates the Series/Index
and elements of `others` element-wise.
If `others` is not being passed then all values in the Series are
concatenated in a single string with a given `sep`.

Parameters
----------
others : list-like, or list of list-likes, optional
List-likes (or a list of them) of the same length as calling object.
If None, returns str concatenating strings of the Series.
sep : string or None, default None
If None, concatenates without any separator.
na_rep : string or None, default None
If None, NA in the series are ignored.
list_of_columns : list of numpy arrays
List of arrays to be concatenated with sep;
these arrays may not contain NaNs!
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is with this documented limitation? Believe in master that NaN is valid and returns NaN when passed in

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd
the new helper function cat_core has nothing to do with the existing (essentially fully-fledged up to index-handling) str_cat. It's an internal function to avoid lots of copied code, and (as it just wraps np.sum) is not nan-safe for string values.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd
Does that answer your question? what would you like to see for this docstring?

sep : string
The separator string for concatenating the columns

Returns
-------
concat
ndarray containing concatenated results (if `others is not None`)
or str (if `others is None`)
nd.array
The concatenation of list_of_columns with sep
"""
if sep is None:
sep = ''

if others is not None:
arrays = _get_array_list(arr, others)

n = _length_check(arrays)
masks = np.array([isna(x) for x in arrays])
cats = None

if na_rep is None:
na_mask = np.logical_or.reduce(masks, axis=0)

result = np.empty(n, dtype=object)
np.putmask(result, na_mask, np.nan)

notmask = ~na_mask

tuples = zip(*[x[notmask] for x in arrays])
cats = [sep.join(tup) for tup in tuples]

result[notmask] = cats
else:
for i, x in enumerate(arrays):
x = np.where(masks[i], na_rep, x)
if cats is None:
cats = x
else:
cats = cats + sep + x

result = cats

return result
else:
arr = np.asarray(arr, dtype=object)
mask = isna(arr)
if na_rep is None and mask.any():
if sep == '':
na_rep = ''
else:
return sep.join(arr[notna(arr)])
return sep.join(np.where(mask, na_rep, arr))


def _length_check(others):
n = None
for x in others:
try:
if n is None:
n = len(x)
elif len(x) != n:
raise ValueError('All arrays must be same length')
except TypeError:
raise ValueError('Must pass arrays containing strings to str_cat')
return n
list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
list_with_sep[::2] = list_of_columns
return np.sum(list_with_sep, axis=0)


def _na_map(f, arr, na_result=np.nan, dtype=object):
Expand Down Expand Up @@ -2283,6 +2196,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):

if isinstance(others, compat.string_types):
raise ValueError("Did you mean to supply a `sep` keyword?")
if sep is None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not anything you need to change in this PR but if we just overwrite None here it would probably make more sense to change the default function signature to simply be ''

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed - thought about that as well, but didn't know if that's considered an API change.

sep = ''

if isinstance(self._orig, Index):
data = Series(self._orig, index=self._orig)
Expand All @@ -2291,9 +2206,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):

# concatenate Series/Index with itself if no "others"
if others is None:
result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
return self._wrap_result(result,
use_codes=(not self._is_categorical))
data = ensure_object(data)
na_mask = isna(data)
if na_rep is None and na_mask.any():
data = data[~na_mask]
elif na_rep is not None and na_mask.any():
data = np.where(na_mask, na_rep, data)
return sep.join(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do a single sep.join, and just have the branches mask the data as needed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


try:
# turn anything in "others" into lists of Series
Expand All @@ -2320,23 +2239,45 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
"'outer'|'inner'|'right'`. The future default will "
"be `join='left'`.", FutureWarning, stacklevel=2)

# if join is None, _get_series_list already force-aligned indexes
join = 'left' if join is None else join

# align if required
if join is not None:
if any(not data.index.equals(x.index) for x in others):
# Need to add keys for uniqueness in case of duplicate columns
others = concat(others, axis=1,
join=(join if join == 'inner' else 'outer'),
keys=range(len(others)))
keys=range(len(others)), copy=False)
data, others = data.align(others, join=join)
others = [others[x] for x in others] # again list of Series

# str_cat discards index
res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
all_cols = [ensure_object(x) for x in [data] + others]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming the index is aligned here can we alternately just concat the columns together and call ensure_object on the entire frame instead of using a list comprehension?

na_masks = np.array([isna(x) for x in all_cols])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment above but should be able to use pd.isnull instead of a list comp with isna

union_mask = np.logical_or.reduce(na_masks, axis=0)

if na_rep is None and union_mask.any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment on these cases

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comments

# no na_rep means NaNs for all rows where any column has a NaN
# only necessary if there are actually any NaNs
result = np.empty(len(data), dtype=object)
np.putmask(result, union_mask, np.nan)

not_masked = ~union_mask
result[not_masked] = cat_core([x[not_masked] for x in all_cols],
sep)
elif na_rep is not None and union_mask.any():
# fill NaNs with na_rep in case there are actually any NaNs
all_cols = [np.where(nm, na_rep, col)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Readability nit but can you change nm to na_mask? Makes it clearer what is going on (on initial glance misread nm to be something name related)

for nm, col in zip(na_masks, all_cols)]
result = cat_core(all_cols, sep)
else:
# no NaNs - can just concatenate
result = cat_core(all_cols, sep)

if isinstance(self._orig, Index):
res = Index(res, name=self._orig.name)
result = Index(result, name=self._orig.name)
else: # Series
res = Series(res, index=data.index, name=self._orig.name)
return res
result = Series(result, index=data.index, name=self._orig.name)
return result

_shared_docs['str_split'] = ("""
Split strings around given separator/delimiter.
Expand Down
53 changes: 6 additions & 47 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
assert i == 100
assert s == 'h'

def test_cat(self):
one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)

# single array
result = strings.str_cat(one)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback

I removed one test for the internal method that has been factored away

Please have a look here - this is directly importing the internal method and testing it (not str.cat)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

exp = 'aabbc'
assert result == exp

result = strings.str_cat(one, na_rep='NA')
exp = 'aabbcNA'
assert result == exp

result = strings.str_cat(one, na_rep='-')
exp = 'aabbc-'
assert result == exp

result = strings.str_cat(one, sep='_', na_rep='NA')
exp = 'a_a_b_b_c_NA'
assert result == exp

result = strings.str_cat(two, sep='-')
exp = 'a-b-d-foo'
assert result == exp

# Multiple arrays
result = strings.str_cat(one, [two], na_rep='NA')
exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
dtype=np.object_)
tm.assert_numpy_array_equal(result, exp)

result = strings.str_cat(one, two)
exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
tm.assert_almost_equal(result, exp)

# error for incorrect lengths
rgx = 'All arrays must be same length'
three = Series(['1', '2', '3'])

with tm.assert_raises_regex(ValueError, rgx):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Furthermore, this removed test (test_cat) is exactly replicated in the test directly below (test_str_cat).

I can't mark lines that are not in the diff, but check out

result = s.str.cat()

I replicated the removed test (acting on strings.test_cat) as a test acting on str.cat within #20347.

strings.str_cat(one, three)

# error for incorrect type
rgx = "Must pass arrays containing strings to str_cat"
with tm.assert_raises_regex(ValueError, rgx):
strings.str_cat(one, 'three')

@pytest.mark.parametrize('box', [Series, Index])
@pytest.mark.parametrize('other', [None, Series, Index])
def test_str_cat_name(self, box, other):
Expand Down Expand Up @@ -414,6 +367,12 @@ def test_str_cat_align_mixed_inputs(self, join):
with tm.assert_raises_regex(ValueError, rgx):
s.str.cat([t, z], join=join)

def test_str_cat_raises(self):
# non-strings hiding behind object dtype
s = Series([1, 2, 3, 4], dtype='object')
with tm.assert_raises_regex(TypeError, "unsupported operand type.*"):
s.str.cat(s)

def test_str_cat_special_cases(self):
s = Series(['a', 'b', 'c', 'd'])
t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
Expand Down