From 5f8890c1e6aeeda41e9818ce0c46b259efaa7f2f Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 5 Sep 2018 02:51:16 +0200 Subject: [PATCH 1/9] CLN: str.cat internals --- pandas/core/strings.py | 165 +++++++++++------------------------ pandas/tests/test_strings.py | 49 +---------- 2 files changed, 53 insertions(+), 161 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5a23951145cb4..490e048053719 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3,7 +3,7 @@ from pandas.compat import zip from pandas.core.dtypes.generic import ABCSeries, ABCIndex -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, @@ -36,114 +36,28 @@ _shared_docs = dict() -def _get_array_list(arr, others): - """ - Auxiliary function for :func:`str_cat` - - Parameters - ---------- - arr : ndarray - The left-most ndarray of the concatenation - others : list, ndarray, Series - The rest of the content to concatenate. If list of list-likes, - all elements must be passable to ``np.asarray``. - - Returns - ------- - list - List of all necessary arrays - """ - from pandas.core.series import Series - - if len(others) and isinstance(com.values_from_object(others)[0], - (list, np.ndarray, Series)): - arrays = [arr] + list(others) - else: - arrays = [arr, others] - - return [np.asarray(x, dtype=object) for x in arrays] - - -def str_cat(arr, others=None, sep=None, na_rep=None): - """ +def interleave_sep(all_cols, sep): + ''' Auxiliary function for :meth:`str.cat` - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not being passed then all values in the Series are - concatenated in a single string with a given `sep`. - Parameters ---------- - others : list-like, or list of list-likes, optional - List-likes (or a list of them) of the same length as calling object. - If None, returns str concatenating strings of the Series. - sep : string or None, default None - If None, concatenates without any separator. - na_rep : string or None, default None - If None, NA in the series are ignored. + all_cols : list of numpy arrays + List of arrays to be concatenated with sep + sep : string + The separator string for concatenating the columns Returns ------- - concat - ndarray containing concatenated results (if `others is not None`) - or str (if `others is None`) - """ - if sep is None: - sep = '' - - if others is not None: - arrays = _get_array_list(arr, others) - - n = _length_check(arrays) - masks = np.array([isna(x) for x in arrays]) - cats = None - - if na_rep is None: - na_mask = np.logical_or.reduce(masks, axis=0) - - result = np.empty(n, dtype=object) - np.putmask(result, na_mask, np.nan) - - notmask = ~na_mask - - tuples = zip(*[x[notmask] for x in arrays]) - cats = [sep.join(tup) for tup in tuples] - - result[notmask] = cats - else: - for i, x in enumerate(arrays): - x = np.where(masks[i], na_rep, x) - if cats is None: - cats = x - else: - cats = cats + sep + x - - result = cats - - return result - else: - arr = np.asarray(arr, dtype=object) - mask = isna(arr) - if na_rep is None and mask.any(): - if sep == '': - na_rep = '' - else: - return sep.join(arr[notna(arr)]) - return sep.join(np.where(mask, na_rep, arr)) - - -def _length_check(others): - n = None - for x in others: - try: - if n is None: - n = len(x) - elif len(x) != n: - raise ValueError('All arrays must be same length') - except TypeError: - raise ValueError('Must pass arrays containing strings to str_cat') - return n + list + The list of arrays interleaved with sep; to be fed to np.sum + ''' + if sep == '': + # no need to add empty strings + return all_cols + result = [sep] * (2 * len(all_cols) - 1) + result[::2] = all_cols + return result def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -2283,6 +2197,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): if isinstance(others, compat.string_types): raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = '' if isinstance(self._orig, Index): data = Series(self._orig, index=self._orig) @@ -2291,9 +2207,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # concatenate Series/Index with itself if no "others" if others is None: - result = str_cat(data, others=others, sep=sep, na_rep=na_rep) - return self._wrap_result(result, - use_codes=(not self._is_categorical)) + data = data.astype(object).values + mask = isna(data) + if mask.any(): + if na_rep is None: + return sep.join(data[~mask]) + return sep.join(np.where(mask, na_rep, data)) + return sep.join(data) try: # turn anything in "others" into lists of Series @@ -2320,23 +2240,42 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "'outer'|'inner'|'right'`. The future default will " "be `join='left'`.", FutureWarning, stacklevel=2) - # align if required - if join is not None: + # if join is None, _get_series_list already aligned indexes + join = 'left' if join is None else join + + if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns others = concat(others, axis=1, join=(join if join == 'inner' else 'outer'), - keys=range(len(others))) + keys=range(len(others)), copy=False) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series - # str_cat discards index - res = str_cat(data, others=others, sep=sep, na_rep=na_rep) + all_cols = [x.astype(object).values for x in [data] + others] + masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(masks, axis=0) + + if na_rep is None and union_mask.any(): + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + all_cols = interleave_sep([x[not_masked] for x in all_cols], sep) + + result[not_masked] = np.sum(all_cols, axis=0) + elif na_rep is not None and union_mask.any(): + # fill NaNs + all_cols = [np.where(masks[i], na_rep, all_cols[i]) + for i in range(len(all_cols))] + result = np.sum(interleave_sep(all_cols, sep), axis=0) + else: # no NaNs + result = np.sum(interleave_sep(all_cols, sep), axis=0) if isinstance(self._orig, Index): - res = Index(res, name=self._orig.name) + result = Index(result, name=self._orig.name) else: # Series - res = Series(res, index=data.index, name=self._orig.name) - return res + result = Series(result, index=data.index, name=self._orig.name) + return result _shared_docs['str_split'] = (""" Split strings around given separator/delimiter. diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bd450cdcf8054..b9cc153d67c93 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -97,53 +97,6 @@ def test_iter_object_try_string(self): assert i == 100 assert s == 'h' - def test_cat(self): - one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) - two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) - - # single array - result = strings.str_cat(one) - exp = 'aabbc' - assert result == exp - - result = strings.str_cat(one, na_rep='NA') - exp = 'aabbcNA' - assert result == exp - - result = strings.str_cat(one, na_rep='-') - exp = 'aabbc-' - assert result == exp - - result = strings.str_cat(one, sep='_', na_rep='NA') - exp = 'a_a_b_b_c_NA' - assert result == exp - - result = strings.str_cat(two, sep='-') - exp = 'a-b-d-foo' - assert result == exp - - # Multiple arrays - result = strings.str_cat(one, [two], na_rep='NA') - exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], - dtype=np.object_) - tm.assert_numpy_array_equal(result, exp) - - result = strings.str_cat(one, two) - exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) - tm.assert_almost_equal(result, exp) - - # error for incorrect lengths - rgx = 'All arrays must be same length' - three = Series(['1', '2', '3']) - - with tm.assert_raises_regex(ValueError, rgx): - strings.str_cat(one, three) - - # error for incorrect type - rgx = "Must pass arrays containing strings to str_cat" - with tm.assert_raises_regex(ValueError, rgx): - strings.str_cat(one, 'three') - @pytest.mark.parametrize('box', [Series, Index]) @pytest.mark.parametrize('other', [None, Series, Index]) def test_str_cat_name(self, box, other): @@ -3136,7 +3089,7 @@ def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - pytest.raises(TypeError, lhs.str.cat, rhs) + pytest.raises(TypeError, lhs.str.cat, rhs, sep=',') else: result = lhs.str.cat(rhs) expected = Series(np.array( From 285a1f7b8c382e9d36f1971e4d48efe887da815b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 19:07:36 +0200 Subject: [PATCH 2/9] Review (jreback) --- pandas/core/strings.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 490e048053719..8622a5d705176 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -5,6 +5,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCIndex from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( + ensure_object, is_bool_dtype, is_categorical_dtype, is_object_dtype, @@ -36,13 +37,13 @@ _shared_docs = dict() -def interleave_sep(all_cols, sep): - ''' +def interleave_sep(list_of_columns, sep): + """ Auxiliary function for :meth:`str.cat` Parameters ---------- - all_cols : list of numpy arrays + list_of_columns : list of numpy arrays List of arrays to be concatenated with sep sep : string The separator string for concatenating the columns @@ -51,12 +52,12 @@ def interleave_sep(all_cols, sep): ------- list The list of arrays interleaved with sep; to be fed to np.sum - ''' + """ if sep == '': # no need to add empty strings - return all_cols - result = [sep] * (2 * len(all_cols) - 1) - result[::2] = all_cols + return list_of_columns + result = [sep] * (2 * len(list_of_columns) - 1) + result[::2] = list_of_columns return result @@ -2207,12 +2208,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # concatenate Series/Index with itself if no "others" if others is None: - data = data.astype(object).values + data = ensure_object(data) mask = isna(data) - if mask.any(): - if na_rep is None: - return sep.join(data[~mask]) - return sep.join(np.where(mask, na_rep, data)) + if na_rep is None and mask.any(): + data = data[~mask] + elif na_rep is not None and mask.any(): + data = np.where(mask, na_rep, data) return sep.join(data) try: @@ -2251,11 +2252,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series - all_cols = [x.astype(object).values for x in [data] + others] + all_cols = [ensure_object(x) for x in [data] + others] masks = np.array([isna(x) for x in all_cols]) union_mask = np.logical_or.reduce(masks, axis=0) if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs result = np.empty(len(data), dtype=object) np.putmask(result, union_mask, np.nan) @@ -2264,11 +2267,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): result[not_masked] = np.sum(all_cols, axis=0) elif na_rep is not None and union_mask.any(): - # fill NaNs - all_cols = [np.where(masks[i], na_rep, all_cols[i]) - for i in range(len(all_cols))] + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [np.where(mask, na_rep, col) + for mask, col in zip(masks, all_cols)] result = np.sum(interleave_sep(all_cols, sep), axis=0) - else: # no NaNs + else: + # no NaNs - can just concatenate result = np.sum(interleave_sep(all_cols, sep), axis=0) if isinstance(self._orig, Index): From 28e78594e91640c1ea8928e1a6ef083cc225f228 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 19:14:05 +0200 Subject: [PATCH 3/9] Refactor interleave_sep to use np.sum --- pandas/core/strings.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 8622a5d705176..3e7abfe56e9d7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -37,28 +37,29 @@ _shared_docs = dict() -def interleave_sep(list_of_columns, sep): +def cat_core(list_of_columns, sep): """ Auxiliary function for :meth:`str.cat` Parameters ---------- list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! sep : string The separator string for concatenating the columns Returns ------- - list - The list of arrays interleaved with sep; to be fed to np.sum + nd.array + The concatenation of list_of_columns with sep """ if sep == '': # no need to add empty strings - return list_of_columns - result = [sep] * (2 * len(list_of_columns) - 1) - result[::2] = list_of_columns - return result + return np.sum(list_of_columns, axis=0) + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + return np.sum(list_with_sep, axis=0) def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -2263,17 +2264,16 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): np.putmask(result, union_mask, np.nan) not_masked = ~union_mask - all_cols = interleave_sep([x[not_masked] for x in all_cols], sep) - - result[not_masked] = np.sum(all_cols, axis=0) + result[not_masked] = cat_core([x[not_masked] for x in all_cols], + sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs all_cols = [np.where(mask, na_rep, col) for mask, col in zip(masks, all_cols)] - result = np.sum(interleave_sep(all_cols, sep), axis=0) + result = cat_core(all_cols, sep) else: # no NaNs - can just concatenate - result = np.sum(interleave_sep(all_cols, sep), axis=0) + result = cat_core(all_cols, sep) if isinstance(self._orig, Index): result = Index(result, name=self._orig.name) From 807f18e8047ec32e5f76268fdd4461ed0f68a857 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 20:01:49 +0200 Subject: [PATCH 4/9] Lint; more consistent naming for masks --- pandas/core/strings.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3e7abfe56e9d7..29095e90ce14f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2210,11 +2210,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # concatenate Series/Index with itself if no "others" if others is None: data = ensure_object(data) - mask = isna(data) - if na_rep is None and mask.any(): - data = data[~mask] - elif na_rep is not None and mask.any(): - data = np.where(mask, na_rep, data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) return sep.join(data) try: @@ -2254,8 +2254,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): others = [others[x] for x in others] # again list of Series all_cols = [ensure_object(x) for x in [data] + others] - masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(masks, axis=0) + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) if na_rep is None and union_mask.any(): # no na_rep means NaNs for all rows where any column has a NaN @@ -2268,8 +2268,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(mask, na_rep, col) - for mask, col in zip(masks, all_cols)] + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] result = cat_core(all_cols, sep) else: # no NaNs - can just concatenate From ed27c663681f2e0112109d74f8d450eb56218601 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 20:13:53 +0200 Subject: [PATCH 5/9] Add comment --- pandas/core/strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 29095e90ce14f..69646e18dd3c3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2245,6 +2245,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # if join is None, _get_series_list already aligned indexes join = 'left' if join is None else join + # align if required if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns others = concat(others, axis=1, From 0d3c6d21ed9f262acd712072a4bcde12329b51df Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 8 Oct 2018 20:13:15 +0200 Subject: [PATCH 6/9] Review (WillAyd) --- pandas/core/strings.py | 3 --- pandas/tests/test_strings.py | 8 +++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 69646e18dd3c3..2d5c17f3ac088 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -54,9 +54,6 @@ def cat_core(list_of_columns, sep): nd.array The concatenation of list_of_columns with sep """ - if sep == '': - # no need to add empty strings - return np.sum(list_of_columns, axis=0) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns return np.sum(list_with_sep, axis=0) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b9cc153d67c93..75b1bcb8b2938 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -367,6 +367,12 @@ def test_str_cat_align_mixed_inputs(self, join): with tm.assert_raises_regex(ValueError, rgx): s.str.cat([t, z], join=join) + def test_str_cat_raises(self): + # non-strings hiding behind object dtype + s = Series([1, 2, 3, 4], dtype='object') + with tm.assert_raises_regex(TypeError, "unsupported operand type.*"): + s.str.cat(s) + def test_str_cat_special_cases(self): s = Series(['a', 'b', 'c', 'd']) t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) @@ -3089,7 +3095,7 @@ def test_method_on_bytes(self): lhs = Series(np.array(list('abc'), 'S1').astype(object)) rhs = Series(np.array(list('def'), 'S1').astype(object)) if compat.PY3: - pytest.raises(TypeError, lhs.str.cat, rhs, sep=',') + pytest.raises(TypeError, lhs.str.cat, rhs) else: result = lhs.str.cat(rhs) expected = Series(np.array( From 36c6240857296ddb85858351de4e25791a9ff4d7 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 9 Oct 2018 08:27:33 +0200 Subject: [PATCH 7/9] Review (WillAyd) --- pandas/core/strings.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2d5c17f3ac088..e63dc2b6e7e42 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -37,15 +37,15 @@ _shared_docs = dict() -def cat_core(list_of_columns, sep): +def cat_core(all_cols, sep): """ Auxiliary function for :meth:`str.cat` Parameters ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! + all_cols : two-dimensional numpy array + array of columns to be concatenated with sep; + this array may not contain NaNs! sep : string The separator string for concatenating the columns @@ -54,9 +54,12 @@ def cat_core(list_of_columns, sep): nd.array The concatenation of list_of_columns with sep """ + list_of_columns = np.split(all_cols, all_cols.shape[1], axis=1) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns - return np.sum(list_with_sep, axis=0) + # np.split splits into arrays of shape (N, 1); NOT (N,) + # need to reduce dimensionality of result + return np.sum(list_with_sep, axis=0)[:, 0] def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -2239,21 +2242,21 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "'outer'|'inner'|'right'`. The future default will " "be `join='left'`.", FutureWarning, stacklevel=2) - # if join is None, _get_series_list already aligned indexes - join = 'left' if join is None else join + # concatenate others into DataFrame; need to add keys for uniqueness in + # case of duplicate columns (for join is None, all indexes are already + # the same after _get_series_list, which forces alignment in this case) + others = concat(others, axis=1, + join=(join if join == 'inner' else 'outer'), + keys=range(len(others)), copy=False) # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat(others, axis=1, - join=(join if join == 'inner' else 'outer'), - keys=range(len(others)), copy=False) + if not data.index.equals(others.index): data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) + # collect all columns + all_cols = ensure_object(concat([data, others], axis=1, copy=False)) + na_masks = isna(all_cols) + union_mask = np.logical_or.reduce(na_masks, axis=1) if na_rep is None and union_mask.any(): # no na_rep means NaNs for all rows where any column has a NaN @@ -2262,13 +2265,10 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): np.putmask(result, union_mask, np.nan) not_masked = ~union_mask - result[not_masked] = cat_core([x[not_masked] for x in all_cols], - sep) + result[not_masked] = cat_core(all_cols[not_masked], sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(nm, na_rep, col) - for nm, col in zip(na_masks, all_cols)] - result = cat_core(all_cols, sep) + result = cat_core(np.where(na_masks, na_rep, all_cols), sep) else: # no NaNs - can just concatenate result = cat_core(all_cols, sep) From a97fe67ed33c842fd0d2f92a796d0768271b1bad Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 10 Oct 2018 22:38:16 +0200 Subject: [PATCH 8/9] Add np-compat --- pandas/core/strings.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e63dc2b6e7e42..ebdd2905e29e1 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -18,6 +18,7 @@ import pandas.core.common as com from pandas.core.algorithms import take_1d import pandas.compat as compat +from pandas.compat.numpy import _np_version_under1p11 from pandas.core.base import NoNewAttributesMixin from pandas.util._decorators import Appender import re @@ -57,9 +58,12 @@ def cat_core(all_cols, sep): list_of_columns = np.split(all_cols, all_cols.shape[1], axis=1) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns - # np.split splits into arrays of shape (N, 1); NOT (N,) - # need to reduce dimensionality of result - return np.sum(list_with_sep, axis=0)[:, 0] + res = np.sum(list_with_sep, axis=0) + if not (_np_version_under1p11 and len(res) == 0): + # np.split splits into arrays of shape (N, 1); NOT (N,) + # need to reduce dimensionality of result + res = res[:, 0] + return res def _na_map(f, arr, na_result=np.nan, dtype=object): From e58ec9dfa82a459d9b316b678b77d50fc4901e9e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 12 Oct 2018 08:45:06 +0200 Subject: [PATCH 9/9] Revert using more idiomatic code due to perf --- pandas/core/strings.py | 46 +++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ebdd2905e29e1..4086021bc61a6 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -18,7 +18,6 @@ import pandas.core.common as com from pandas.core.algorithms import take_1d import pandas.compat as compat -from pandas.compat.numpy import _np_version_under1p11 from pandas.core.base import NoNewAttributesMixin from pandas.util._decorators import Appender import re @@ -38,15 +37,15 @@ _shared_docs = dict() -def cat_core(all_cols, sep): +def cat_core(list_of_columns, sep): """ Auxiliary function for :meth:`str.cat` Parameters ---------- - all_cols : two-dimensional numpy array - array of columns to be concatenated with sep; - this array may not contain NaNs! + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! sep : string The separator string for concatenating the columns @@ -55,15 +54,9 @@ def cat_core(all_cols, sep): nd.array The concatenation of list_of_columns with sep """ - list_of_columns = np.split(all_cols, all_cols.shape[1], axis=1) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns - res = np.sum(list_with_sep, axis=0) - if not (_np_version_under1p11 and len(res) == 0): - # np.split splits into arrays of shape (N, 1); NOT (N,) - # need to reduce dimensionality of result - res = res[:, 0] - return res + return np.sum(list_with_sep, axis=0) def _na_map(f, arr, na_result=np.nan, dtype=object): @@ -2246,21 +2239,21 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): "'outer'|'inner'|'right'`. The future default will " "be `join='left'`.", FutureWarning, stacklevel=2) - # concatenate others into DataFrame; need to add keys for uniqueness in - # case of duplicate columns (for join is None, all indexes are already - # the same after _get_series_list, which forces alignment in this case) - others = concat(others, axis=1, - join=(join if join == 'inner' else 'outer'), - keys=range(len(others)), copy=False) + # if join is None, _get_series_list already force-aligned indexes + join = 'left' if join is None else join # align if required - if not data.index.equals(others.index): + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat(others, axis=1, + join=(join if join == 'inner' else 'outer'), + keys=range(len(others)), copy=False) data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series - # collect all columns - all_cols = ensure_object(concat([data, others], axis=1, copy=False)) - na_masks = isna(all_cols) - union_mask = np.logical_or.reduce(na_masks, axis=1) + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) if na_rep is None and union_mask.any(): # no na_rep means NaNs for all rows where any column has a NaN @@ -2269,10 +2262,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): np.putmask(result, union_mask, np.nan) not_masked = ~union_mask - result[not_masked] = cat_core(all_cols[not_masked], sep) + result[not_masked] = cat_core([x[not_masked] for x in all_cols], + sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs - result = cat_core(np.where(na_masks, na_rep, all_cols), sep) + all_cols = [np.where(nm, na_rep, col) + for nm, col in zip(na_masks, all_cols)] + result = cat_core(all_cols, sep) else: # no NaNs - can just concatenate result = cat_core(all_cols, sep)