From c58123c8e6e5b5596ceea2029525677cacda414e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Feb 2018 14:23:31 -0800 Subject: [PATCH 01/25] Added test case for groupby fill methods --- pandas/tests/groupby/test_groupby.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 129ac6b06205c..d3b8d38688682 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2061,6 +2061,25 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) + @pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'foo', 'foo', 'foo', 'bar', 'bar', 'bar']), + ("ffill", 1, + [np.nan, np.nan, 'foo', 'foo', np.nan, 'bar', 'bar', np.nan]), + ("bfill", None, + ['foo', 'foo', 'foo', 'bar', 'bar', 'bar', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'foo', 'foo', np.nan, 'bar', 'bar', np.nan, np.nan]) + ]) + def test_group_fill_methods(self, fill_method, limit, exp_vals): + vals = [np.nan, np.nan, 'foo', np.nan, np.nan, 'bar', np.nan, np.nan] + keys = ['a'] * len(vals) + ['b'] * len(vals) + df = DataFrame({'key': keys, 'val': vals * 2}) + result = getattr(df.groupby('key'), fill_method)(limit=limit) + + exp = DataFrame({'key': keys, 'val': exp_vals * 2}) + assert_frame_equal(result, exp) + def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], 'name': ['foo', 'bar', 'baz'] * 2}) From 2bc80239b92fa1d4fb23da8b0496739e819045c1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Feb 2018 16:07:37 -0800 Subject: [PATCH 02/25] Added code for group_fillna --- pandas/_libs/groupby_helper.pxi.in | 71 +++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index e03e3af65755b..31f30377ee26a 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -273,7 +273,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} #---------------------------------------------------------------------- -# group_nth, group_last, group_rank +# group_nth, group_last, group_rank, group_fillna #---------------------------------------------------------------------- {{py: @@ -574,6 +574,75 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for i in range(N): out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_fillna_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, + object method, + int64_t limit): + """Fills values forwards or backwards within a group + + Parameters + ---------- + out : array of {{dest_type2}} values which this method will write its + results to + values : array of {{c_type}} values which may require filling + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + method : {'ffill', 'bfill'} + Direction for fill to be applied (forwards or backwards, respectively) + limit : Consecutive values to fill before stopping, or -1 for no limit + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + Py_ssize_t i, N + ndarray[uint8_t] mask + ndarray[int64_t] sorted_labels + {{dest_type2}} curr_fill_val = {{nan_val}} + int64_t idx, filled_vals=0 + + N, K = ( values).shape + + {{if name=='int64'}} + mask = (values[:, 0] == {{nan_val}}).astype(np.uint8) + {{elif name=='object'}} + mask = np.array([x != x for x in values[:, 0]]).astype(np.uint8) + {{else}} + mask = np.isnan(values[:, 0]).astype(np.uint8) + {{endif}} + + sorted_labels = np.argsort(labels) + if method == 'bfill': + sorted_labels[::-1].sort() + + {{if name == 'object'}} + if True: # make templating happy + {{else}} + with nogil: + {{endif}} + for i in range(N): + idx = sorted_labels[i] + if mask[idx]: # is missing + if limit == -1 or filled_vals < limit: + out[idx, 0] = curr_fill_val + else: + out[idx, 0] == {{nan_val}} + filled_vals += 1 + else: # reset items when not missing + filled_vals = 0 + curr_fill_val = values[idx, 0] + out[idx, 0] = values[idx, 0] + + # If we move to the next group, reset + # the fill_val and counter + if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: + curr_fill_val = {{nan_val}} + filled_vals = 0 {{endfor}} From 3cb25c014a5031e0dcb3cfcaf755b3ebf2562d54 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Feb 2018 22:46:59 -0800 Subject: [PATCH 03/25] Added ASV benchmarks --- asv_bench/benchmarks/groupby.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 61db39528a5fb..c347442784d41 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -370,11 +370,11 @@ class GroupByMethods(object): param_names = ['dtype', 'method'] params = [['int', 'float'], - ['all', 'any', 'count', 'cumcount', 'cummax', 'cummin', - 'cumprod', 'cumsum', 'describe', 'first', 'head', 'last', 'mad', - 'max', 'min', 'median', 'mean', 'nunique', 'pct_change', 'prod', - 'rank', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', - 'unique', 'value_counts', 'var']] + ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', + 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', + 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', + 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', + 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] def setup(self, dtype, method): ngroups = 1000 From 7fecc1165791ef30f173da47ebe4581f73b78dd0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Feb 2018 22:49:00 -0800 Subject: [PATCH 04/25] Connected GroupBy method to Cython fillna --- pandas/core/groupby.py | 113 ++++++++++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 31 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b1615f720368d..204a0f55e29a0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -877,21 +877,28 @@ def apply(self, func, *args, **kwargs): func = self._is_builtin_func(func) - # this is needed so we don't try and wrap strings. If we could - # resolve functions to their callable functions prior, this - # wouldn't be needed - if args or kwargs: - if callable(func): - - @wraps(func) - def f(g): - with np.errstate(all='ignore'): - return func(g, *args, **kwargs) + # Try to go down the Cython path first + try: + f = self.grouper._cython_functions['apply'][func] + return self.grouper._cython_apply(f, self._selected_obj, self.axis, + **kwargs) + except KeyError: + # this is needed so we don't try and wrap strings. If we could + # resolve functions to their callable functions prior, this + # wouldn't be needed + if args or kwargs: + if callable(func): + + @wraps(func) + def f(g): + with np.errstate(all='ignore'): + return func(g, *args, **kwargs) + else: + raise ValueError('func must be a callable if args or ' + 'kwargs are supplied and func is not ' + 'implemented in Cython') else: - raise ValueError('func must be a callable if args or ' - 'kwargs are supplied') - else: - f = func + f = func # ignore SettingWithCopy here in case the user mutates with option_context('mode.chained_assignment', None): @@ -1474,7 +1481,7 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply(lambda x: x.ffill(limit=limit)) + return self.apply('ffill', limit=limit) ffill = pad @Substitution(name='groupby') @@ -1494,7 +1501,7 @@ def backfill(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply(lambda x: x.bfill(limit=limit)) + return self.apply('bfill', limit=limit) bfill = backfill @Substitution(name='groupby') @@ -2034,6 +2041,32 @@ def _get_group_keys(self): self.levels, self.labels) + def _cython_apply(self, f, data, axis, **kwargs): + output = collections.OrderedDict() + for col in data.columns: + if col in self.names: + output[col] = data[col].values + else: + # duplicative of _get_cython_function; needs refactor + dtype_str = data[col].dtype.name + values = data[col].values[:, None] + func = afunc = self._get_func(f['name'], dtype_str) + f = f.get('f') + + def wrapper(*args, **kwargs): + return f(afunc, *args, **kwargs) + + func = wrapper + labels, _, _ = self.group_info + + result = _maybe_fill(np.empty_like(values, dtype=dtype_str), + fill_value=np.nan) + func(result, values, labels, **kwargs) + output[col] = result[:, 0] + + # Ugh + return DataFrame(output, index=data.index) + def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -2230,6 +2263,22 @@ def get_group_levels(self): kwargs.get('na_option', 'keep') ) } + }, + 'apply': { + 'ffill': { + 'name': 'group_fillna', + 'f': lambda func, a, b, c, **kwargs: func( + a, b, c, + 'ffill', kwargs['limit'] if kwargs['limit'] else -1 + ) + }, + 'bfill': { + 'name': 'group_fillna', + 'f': lambda func, a, b, c, **kwargs: func( + a, b, c, + 'bfill', kwargs['limit'] if kwargs['limit'] else -1 + ) + } } } @@ -2248,27 +2297,28 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - def _get_cython_function(self, kind, how, values, is_numeric): - - dtype_str = values.dtype.name + def _get_func(self, fname, dtype_str=None, is_numeric=False): + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, fname, None) + if f is not None and is_numeric: + return f - def get_func(fname): - # see if there is a fused-type version of function - # only valid for numeric - f = getattr(libgroupby, fname, None) - if f is not None and is_numeric: + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, 'object']: + f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) + if f is not None: return f - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) - if f is not None: - return f + def _get_cython_function(self, kind, how, values, is_numeric): + + dtype_str = values.dtype.name ftype = self._cython_functions[kind][how] if isinstance(ftype, dict): - func = afunc = get_func(ftype['name']) + func = afunc = self._get_func(ftype['name'], dtype_str=dtype_str, + is_numeric=is_numeric) # a sub-function f = ftype.get('f') @@ -2281,7 +2331,8 @@ def wrapper(*args, **kwargs): func = wrapper else: - func = get_func(ftype) + func = self._get_func(ftype, dtype_str=dtype_str, + is_numeric=is_numeric) if func is None: raise NotImplementedError("function is not implemented for this" From 3c2fb366a861479644e441153124c1f7751cbd09 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 12 Feb 2018 23:26:17 -0800 Subject: [PATCH 05/25] Fixed issue when filling Series after GroupBy --- pandas/core/groupby.py | 48 ++++++++++++++++------------ pandas/tests/groupby/test_groupby.py | 15 ++++++--- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 204a0f55e29a0..e36cd78a02181 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2041,31 +2041,37 @@ def _get_group_keys(self): self.levels, self.labels) - def _cython_apply(self, f, data, axis, **kwargs): - output = collections.OrderedDict() - for col in data.columns: - if col in self.names: - output[col] = data[col].values - else: - # duplicative of _get_cython_function; needs refactor - dtype_str = data[col].dtype.name - values = data[col].values[:, None] - func = afunc = self._get_func(f['name'], dtype_str) - f = f.get('f') + def _cython_apply(self, ftype, data, axis, **kwargs): + def _generate_output(ser): + # duplicative of _get_cython_function; needs refactor + dtype_str = ser.dtype.name + values = ser.values[:, None] + func = afunc = self._get_func(ftype['name'], dtype_str) + f = ftype.get('f') - def wrapper(*args, **kwargs): - return f(afunc, *args, **kwargs) + def wrapper(*args, **kwargs): + return f(afunc, *args, **kwargs) - func = wrapper - labels, _, _ = self.group_info + func = wrapper + labels, _, _ = self.group_info + + result = _maybe_fill(np.empty_like(values, dtype=dtype_str), + fill_value=np.nan) + func(result, values, labels, **kwargs) - result = _maybe_fill(np.empty_like(values, dtype=dtype_str), - fill_value=np.nan) - func(result, values, labels, **kwargs) - output[col] = result[:, 0] + return result[:, 0] - # Ugh - return DataFrame(output, index=data.index) + # Using introspection to determine result; not ideal needs refactor + if type(data) is Series: + return Series(_generate_output(data), name=data.name) + else: + output = collections.OrderedDict() + for col in data.columns: + if col in self.names: + output[col] = data[col].values + else: + output[col] = _generate_output(data[col]) + return DataFrame(output, index=data.index) def apply(self, f, data, axis=0): mutated = self.mutated diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d3b8d38688682..e26ef05b0a5db 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2061,6 +2061,7 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) + @pytest.mark.parametrize("as_series", [True, False]) @pytest.mark.parametrize("fill_method,limit,exp_vals", [ ("ffill", None, [np.nan, np.nan, 'foo', 'foo', 'foo', 'bar', 'bar', 'bar']), @@ -2071,14 +2072,20 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ("bfill", 1, [np.nan, 'foo', 'foo', np.nan, 'bar', 'bar', np.nan, np.nan]) ]) - def test_group_fill_methods(self, fill_method, limit, exp_vals): + def test_group_fill_methods(self, as_series, fill_method, limit, exp_vals): vals = [np.nan, np.nan, 'foo', np.nan, np.nan, 'bar', np.nan, np.nan] keys = ['a'] * len(vals) + ['b'] * len(vals) df = DataFrame({'key': keys, 'val': vals * 2}) - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': exp_vals * 2}) - assert_frame_equal(result, exp) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(exp_vals * 2, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': exp_vals * 2}) + assert_frame_equal(result, exp) def test_dont_clobber_name_column(self): df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], From a52b8c4b9346a5d4a98b179f6bd252901c855e2e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Feb 2018 12:55:52 -0800 Subject: [PATCH 06/25] Added tests to mix group entries; fixed sort bug --- pandas/_libs/groupby_helper.pxi.in | 2 +- pandas/tests/groupby/test_groupby.py | 51 ++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 31f30377ee26a..13dfaaee1b3e7 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -618,7 +618,7 @@ def group_fillna_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, sorted_labels = np.argsort(labels) if method == 'bfill': - sorted_labels[::-1].sort() + sorted_labels = sorted_labels[::-1] {{if name == 'object'}} if True: # make templating happy diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e26ef05b0a5db..2429e9975fc8e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2061,30 +2061,59 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) + @pytest.mark.parametrize("mix_groupings", [True, False]) @pytest.mark.parametrize("as_series", [True, False]) + @pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) @pytest.mark.parametrize("fill_method,limit,exp_vals", [ ("ffill", None, - [np.nan, np.nan, 'foo', 'foo', 'foo', 'bar', 'bar', 'bar']), + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), ("ffill", 1, - [np.nan, np.nan, 'foo', 'foo', np.nan, 'bar', 'bar', np.nan]), + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), ("bfill", None, - ['foo', 'foo', 'foo', 'bar', 'bar', 'bar', np.nan, np.nan]), + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), ("bfill", 1, - [np.nan, 'foo', 'foo', np.nan, 'bar', 'bar', np.nan, np.nan]) + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) ]) - def test_group_fill_methods(self, as_series, fill_method, limit, exp_vals): - vals = [np.nan, np.nan, 'foo', np.nan, np.nan, 'bar', np.nan, np.nan] - keys = ['a'] * len(vals) + ['b'] * len(vals) - df = DataFrame({'key': keys, 'val': vals * 2}) - + def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) if as_series: result = getattr( df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(exp_vals * 2, name='val') + exp = Series(_exp_vals, name='val') assert_series_equal(result, exp) else: result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': exp_vals * 2}) + exp = DataFrame({'key': keys, 'val': _exp_vals}) assert_frame_equal(result, exp) def test_dont_clobber_name_column(self): From 16c1823b5ee368ef72e86cae370e15c9c7e5be95 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Feb 2018 15:00:50 -0800 Subject: [PATCH 07/25] Simplied groupby Cython calls for ffill/bfill --- pandas/_libs/groupby_helper.pxi.in | 55 ++++++++++++ pandas/core/groupby.py | 140 +++++++++++------------------ 2 files changed, 106 insertions(+), 89 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 13dfaaee1b3e7..af9d6b926f23f 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -1023,3 +1023,58 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, out[ii] = -1 label_indexer[lab, idxer_slot] = ii + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_fillna_indexer(ndarray[int64_t] out, + ndarray[uint8_t] mask, + ndarray[int64_t] labels, + object method, + int64_t limit): + """Fills values forwards or backwards within a group + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + Missing values will be written to with a value of -1 + mask : array of int64_t values where a 1 indicates a missing value + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + method : {'ffill', 'bfill'} + Direction for fill to be applied (forwards or backwards, respectively) + limit : Consecutive values to fill before stopping, or -1 for no limit + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + Py_ssize_t i, N + ndarray[int64_t] sorted_labels + int64_t curr_fill_idx=-1 + int64_t idx, filled_vals=0 + + N = len(out) + + sorted_labels = np.argsort(labels) + if method == 'bfill': + sorted_labels = sorted_labels[::-1] + + with nogil: + for i in range(N): + idx = sorted_labels[i] + if mask[idx] == 1: # is missing + # Stop filling once we've hit the limit + if filled_vals >= limit and limit != -1: + curr_fill_idx = -1 + filled_vals += 1 + else: # reset items when not missing + filled_vals = 0 + curr_fill_idx = idx + + out[idx] = curr_fill_idx + # If we move to the next group, reset + # the fill_idx and counter + if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: + curr_fill_idx = -1 + filled_vals = 0 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e36cd78a02181..5c434d9546b4a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -38,7 +38,7 @@ _ensure_float) from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.missing import isna, notna, _maybe_fill +from pandas.core.dtypes.missing import isna, isnull, notna, _maybe_fill from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) @@ -877,28 +877,21 @@ def apply(self, func, *args, **kwargs): func = self._is_builtin_func(func) - # Try to go down the Cython path first - try: - f = self.grouper._cython_functions['apply'][func] - return self.grouper._cython_apply(f, self._selected_obj, self.axis, - **kwargs) - except KeyError: - # this is needed so we don't try and wrap strings. If we could - # resolve functions to their callable functions prior, this - # wouldn't be needed - if args or kwargs: - if callable(func): - - @wraps(func) - def f(g): - with np.errstate(all='ignore'): - return func(g, *args, **kwargs) - else: - raise ValueError('func must be a callable if args or ' - 'kwargs are supplied and func is not ' - 'implemented in Cython') + # this is needed so we don't try and wrap strings. If we could + # resolve functions to their callable functions prior, this + # wouldn't be needed + if args or kwargs: + if callable(func): + + @wraps(func) + def f(g): + with np.errstate(all='ignore'): + return func(g, *args, **kwargs) else: - f = func + raise ValueError('func must be a callable if args or ' + 'kwargs are supplied') + else: + f = func # ignore SettingWithCopy here in case the user mutates with option_context('mode.chained_assignment', None): @@ -1464,6 +1457,25 @@ def expanding(self, *args, **kwargs): from pandas.core.window import ExpandingGroupby return ExpandingGroupby(self, *args, **kwargs) + def _fill(self, how, limit=None): + labels, _, _ = self.grouper.group_info + + # Need int value for Cython + if limit is None: + limit = -1 + output = {} + if type(self) is DataFrameGroupBy: + for nm in self.grouper.names: + output[nm] = self.obj[nm].values + for name, obj in self._iterate_slices(): + indexer = np.zeros_like(labels) + mask = isnull(obj.values).view(np.uint8) + libgroupby.group_fillna_indexer(indexer, mask, labels, how, + limit) + output[name] = algorithms.take_nd(obj.values, indexer) + + return self._wrap_transformed_output(output) + @Substitution(name='groupby') def pad(self, limit=None): """ @@ -1481,7 +1493,7 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply('ffill', limit=limit) + return self._fill('ffill', limit=limit) ffill = pad @Substitution(name='groupby') @@ -1501,7 +1513,7 @@ def backfill(self, limit=None): Series.fillna DataFrame.fillna """ - return self.apply('bfill', limit=limit) + return self._fill('bfill', limit=limit) bfill = backfill @Substitution(name='groupby') @@ -2041,38 +2053,6 @@ def _get_group_keys(self): self.levels, self.labels) - def _cython_apply(self, ftype, data, axis, **kwargs): - def _generate_output(ser): - # duplicative of _get_cython_function; needs refactor - dtype_str = ser.dtype.name - values = ser.values[:, None] - func = afunc = self._get_func(ftype['name'], dtype_str) - f = ftype.get('f') - - def wrapper(*args, **kwargs): - return f(afunc, *args, **kwargs) - - func = wrapper - labels, _, _ = self.group_info - - result = _maybe_fill(np.empty_like(values, dtype=dtype_str), - fill_value=np.nan) - func(result, values, labels, **kwargs) - - return result[:, 0] - - # Using introspection to determine result; not ideal needs refactor - if type(data) is Series: - return Series(_generate_output(data), name=data.name) - else: - output = collections.OrderedDict() - for col in data.columns: - if col in self.names: - output[col] = data[col].values - else: - output[col] = _generate_output(data[col]) - return DataFrame(output, index=data.index) - def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -2269,22 +2249,6 @@ def get_group_levels(self): kwargs.get('na_option', 'keep') ) } - }, - 'apply': { - 'ffill': { - 'name': 'group_fillna', - 'f': lambda func, a, b, c, **kwargs: func( - a, b, c, - 'ffill', kwargs['limit'] if kwargs['limit'] else -1 - ) - }, - 'bfill': { - 'name': 'group_fillna', - 'f': lambda func, a, b, c, **kwargs: func( - a, b, c, - 'bfill', kwargs['limit'] if kwargs['limit'] else -1 - ) - } } } @@ -2303,28 +2267,27 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - def _get_func(self, fname, dtype_str=None, is_numeric=False): - # see if there is a fused-type version of function - # only valid for numeric - f = getattr(libgroupby, fname, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) - if f is not None: - return f - def _get_cython_function(self, kind, how, values, is_numeric): dtype_str = values.dtype.name + def get_func(fname): + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, fname, None) + if f is not None and is_numeric: + return f + + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, 'object']: + f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) + if f is not None: + return f + ftype = self._cython_functions[kind][how] if isinstance(ftype, dict): - func = afunc = self._get_func(ftype['name'], dtype_str=dtype_str, - is_numeric=is_numeric) + func = afunc = get_func(ftype['name']) # a sub-function f = ftype.get('f') @@ -2337,8 +2300,7 @@ def wrapper(*args, **kwargs): func = wrapper else: - func = self._get_func(ftype, dtype_str=dtype_str, - is_numeric=is_numeric) + func = get_func(ftype) if func is None: raise NotImplementedError("function is not implemented for this" From bd3d5e0f5f079305fb1f495690e8ed57f27e5ae4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Feb 2018 15:18:58 -0800 Subject: [PATCH 08/25] Removed abandoned Cython implementation --- pandas/_libs/groupby_helper.pxi.in | 71 +----------------------------- 1 file changed, 1 insertion(+), 70 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index af9d6b926f23f..4d005a23a2e03 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -273,7 +273,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} #---------------------------------------------------------------------- -# group_nth, group_last, group_rank, group_fillna +# group_nth, group_last, group_rank #---------------------------------------------------------------------- {{py: @@ -574,75 +574,6 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, for i in range(N): out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_fillna_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, - ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, - object method, - int64_t limit): - """Fills values forwards or backwards within a group - - Parameters - ---------- - out : array of {{dest_type2}} values which this method will write its - results to - values : array of {{c_type}} values which may require filling - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - method : {'ffill', 'bfill'} - Direction for fill to be applied (forwards or backwards, respectively) - limit : Consecutive values to fill before stopping, or -1 for no limit - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - Py_ssize_t i, N - ndarray[uint8_t] mask - ndarray[int64_t] sorted_labels - {{dest_type2}} curr_fill_val = {{nan_val}} - int64_t idx, filled_vals=0 - - N, K = ( values).shape - - {{if name=='int64'}} - mask = (values[:, 0] == {{nan_val}}).astype(np.uint8) - {{elif name=='object'}} - mask = np.array([x != x for x in values[:, 0]]).astype(np.uint8) - {{else}} - mask = np.isnan(values[:, 0]).astype(np.uint8) - {{endif}} - - sorted_labels = np.argsort(labels) - if method == 'bfill': - sorted_labels = sorted_labels[::-1] - - {{if name == 'object'}} - if True: # make templating happy - {{else}} - with nogil: - {{endif}} - for i in range(N): - idx = sorted_labels[i] - if mask[idx]: # is missing - if limit == -1 or filled_vals < limit: - out[idx, 0] = curr_fill_val - else: - out[idx, 0] == {{nan_val}} - filled_vals += 1 - else: # reset items when not missing - filled_vals = 0 - curr_fill_val = values[idx, 0] - out[idx, 0] = values[idx, 0] - - # If we move to the next group, reset - # the fill_val and counter - if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: - curr_fill_val = {{nan_val}} - filled_vals = 0 {{endfor}} From cae65af85fc9c4d4c00585ed82953db855092a0c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Feb 2018 15:21:15 -0800 Subject: [PATCH 09/25] Added upcast to int64 to prevent 32 bit failures --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 4d005a23a2e03..b4db6a08c8c45 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -987,7 +987,7 @@ def group_fillna_indexer(ndarray[int64_t] out, N = len(out) - sorted_labels = np.argsort(labels) + sorted_labels = np.argsort(labels).view(dtype=np.int64) if method == 'bfill': sorted_labels = sorted_labels[::-1] From 02665142779c2f182b4376d9c46e069e554bd897 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 13 Feb 2018 22:30:51 -0800 Subject: [PATCH 10/25] Fixed issue with reconstructing grouped Series --- pandas/core/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5c434d9546b4a..e55cb5b658150 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1465,8 +1465,9 @@ def _fill(self, how, limit=None): limit = -1 output = {} if type(self) is DataFrameGroupBy: - for nm in self.grouper.names: - output[nm] = self.obj[nm].values + for grp in self.grouper.groupings: + ser = grp.group_index.take(grp.labels) + output[ser.name] = ser.values for name, obj in self._iterate_slices(): indexer = np.zeros_like(labels) mask = isnull(obj.values).view(np.uint8) From 50dc6906c37c83d7acdd254fbc86f61ba69c98dc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Feb 2018 16:23:51 -0800 Subject: [PATCH 11/25] Changed .view to .astype to avoid 32 bit segfaults --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b4db6a08c8c45..9a4cb61e306aa 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -987,7 +987,7 @@ def group_fillna_indexer(ndarray[int64_t] out, N = len(out) - sorted_labels = np.argsort(labels).view(dtype=np.int64) + sorted_labels = np.argsort(labels).astype(np.int64, copy=False) if method == 'bfill': sorted_labels = sorted_labels[::-1] From 9fa8e255773a518f6f7623e88e92ad142ddcf797 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 14 Feb 2018 16:24:13 -0800 Subject: [PATCH 12/25] Added whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index fd3c3a5a7a301..44e5fa790fcf2 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -689,6 +689,7 @@ Performance Improvements - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) +- Improved performance of :func:`GroupBy.ffill` and :func:`GroupBy.bfill` (:issue:`11296`) .. _whatsnew_0230.docs: From 5da06d868dc5cf044a84a32dcf74f0924d2b919d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 10:50:33 -0800 Subject: [PATCH 13/25] Aligned group_fillna and group_shift signatures --- pandas/_libs/groupby_helper.pxi.in | 20 ++++--- pandas/core/groupby.py | 92 +++++++++++++++++++++--------- 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 9a4cb61e306aa..b0dca670eea8d 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -906,7 +906,7 @@ def group_cumsum(numeric[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, +def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, int ngroups, int periods): cdef: Py_ssize_t N, i, j, ii @@ -957,21 +957,19 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, @cython.wraparound(False) @cython.boundscheck(False) -def group_fillna_indexer(ndarray[int64_t] out, - ndarray[uint8_t] mask, - ndarray[int64_t] labels, - object method, +def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, + ndarray[uint8_t] mask, object direction, int64_t limit): - """Fills values forwards or backwards within a group + """Indexes how to fill values forwards or backwards within a group Parameters ---------- out : array of int64_t values which this method will write its results to Missing values will be written to with a value of -1 - mask : array of int64_t values where a 1 indicates a missing value labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` - method : {'ffill', 'bfill'} + mask : array of int64_t values where a 1 indicates a missing value + direction : {'ffill', 'bfill'} Direction for fill to be applied (forwards or backwards, respectively) limit : Consecutive values to fill before stopping, or -1 for no limit @@ -987,8 +985,11 @@ def group_fillna_indexer(ndarray[int64_t] out, N = len(out) + # Make sure all arrays are the same size + assert N == len(labels) == len(mask) + sorted_labels = np.argsort(labels).astype(np.int64, copy=False) - if method == 'bfill': + if direction == 'bfill': sorted_labels = sorted_labels[::-1] with nogil: @@ -1004,6 +1005,7 @@ def group_fillna_indexer(ndarray[int64_t] out, curr_fill_idx = idx out[idx] = curr_fill_idx + # If we move to the next group, reset # the fill_idx and counter if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e55cb5b658150..d386966d7c8a1 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1,5 +1,5 @@ import types -from functools import wraps +from functools import wraps, partial import numpy as np import datetime import collections @@ -1457,25 +1457,14 @@ def expanding(self, *args, **kwargs): from pandas.core.window import ExpandingGroupby return ExpandingGroupby(self, *args, **kwargs) - def _fill(self, how, limit=None): - labels, _, _ = self.grouper.group_info - + def _fill(self, direction, limit=None): # Need int value for Cython if limit is None: limit = -1 - output = {} - if type(self) is DataFrameGroupBy: - for grp in self.grouper.groupings: - ser = grp.group_index.take(grp.labels) - output[ser.name] = ser.values - for name, obj in self._iterate_slices(): - indexer = np.zeros_like(labels) - mask = isnull(obj.values).view(np.uint8) - libgroupby.group_fillna_indexer(indexer, mask, labels, how, - limit) - output[name] = algorithms.take_nd(obj.values, indexer) - return self._wrap_transformed_output(output) + return self._get_cythonized_result('group_fillna_indexer', + self.grouper, needs_mask=True, + direction=direction, limit=limit) @Substitution(name='groupby') def pad(self, limit=None): @@ -1863,6 +1852,52 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform('cummax', numeric_only=False) + def _get_cythonized_result(self, how, grouper, needs_mask=False, + needs_ngroups=False, **kwargs): + """Get result for Cythonized functions + + Parameters + ---------- + how : str, Cythonized function name to be called + grouper : Grouper object containing pertinent group info + needs_mask : bool, default False + Whether boolean mask needs to be part of the Cython call signature + needs_ngroups : bool, default False + Whether number of groups part of the Cython call signature + **kwargs : dict + Extra arguments required for the given function. This method + internally stores an OrderedDict that maps those keywords to + positional arguments before calling the Cython layer + + Returns + ------- + GroupBy object populated with appropriate result(s) + """ + exp_kwds = collections.OrderedDict([ + (('group_fillna_indexer'), ('direction', 'limit')), + (('group_shift_indexer'), ('nperiods',))]) + + labels, _, ngroups = grouper.group_info + output = collections.OrderedDict() + base_func = getattr(libgroupby, how) + + for name, obj in self._iterate_slices(): + indexer = np.zeros_like(labels) + func = partial(base_func, indexer, labels) + if needs_mask: + mask = isnull(obj.values).astype(np.uint8, copy=False) + func = partial(func, mask) + + if needs_ngroups: + func = partial(func, ngroups) + + # Convert any keywords into positional arguments + func = partial(func, *(kwargs[x] for x in exp_kwds[how])) + func() # Call func to modify indexer values in place + output[name] = algorithms.take_nd(obj.values, indexer) + + return self._wrap_transformed_output(output) + @Substitution(name='groupby') @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): @@ -1880,17 +1915,10 @@ def shift(self, periods=1, freq=None, axis=0): if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis)) - labels, _, ngroups = self.grouper.group_info - - # filled in by Cython - indexer = np.zeros_like(labels) - libgroupby.group_shift_indexer(indexer, labels, ngroups, periods) + return self._get_cythonized_result('group_shift_indexer', + self.grouper, needs_ngroups=True, + nperiods=periods) - output = {} - for name, obj in self._iterate_slices(): - output[name] = algorithms.take_nd(obj.values, indexer) - - return self._wrap_transformed_output(output) @Substitution(name='groupby') @Appender(_doc_template) @@ -3597,7 +3625,6 @@ def describe(self, **kwargs): def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): - from functools import partial from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -4605,9 +4632,18 @@ def _apply_to_column_groupbys(self, func): in self._iterate_column_groupbys()), keys=self._selected_obj.columns, axis=1) + def _fill(self, direction, limit=None): + """Overriden method to concat grouped columns in output""" + res = super()._fill(direction, limit=limit) + output = collections.OrderedDict() + for grp in self.grouper.groupings: + ser = grp.group_index.take(grp.labels) + output[ser.name] = ser.values + + return self._wrap_transformed_output(output).join(res) + def count(self): """ Compute count of group, excluding missing values """ - from functools import partial from pandas.core.dtypes.missing import _isna_ndarraylike as isna data, _ = self._get_data_to_aggregate() From 2fe91a4ac6309becb958e97c6b94e61e1dd2c9e2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 14:09:53 -0800 Subject: [PATCH 14/25] Fixed failing test; list comp for _fill method --- pandas/core/groupby.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d386966d7c8a1..bf78270f76845 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1882,10 +1882,10 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): - indexer = np.zeros_like(labels) + indexer = np.zeros_like(labels, dtype=np.int64) func = partial(base_func, indexer, labels) if needs_mask: - mask = isnull(obj.values).astype(np.uint8, copy=False) + mask = isnull(obj.values).view(np.uint8) func = partial(func, mask) if needs_ngroups: @@ -4633,12 +4633,11 @@ def _apply_to_column_groupbys(self, func): keys=self._selected_obj.columns, axis=1) def _fill(self, direction, limit=None): - """Overriden method to concat grouped columns in output""" + """Overriden method to join grouped columns in output""" res = super()._fill(direction, limit=limit) - output = collections.OrderedDict() - for grp in self.grouper.groupings: - ser = grp.group_index.take(grp.labels) - output[ser.name] = ser.values + output = collections.OrderedDict( + (grp.name, grp.group_index.take(grp.labels)) for grp in + self.grouper.groupings) return self._wrap_transformed_output(output).join(res) From 825ba172e22890eb7a4f08b4ae93d7af32cb9489 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 14:26:49 -0800 Subject: [PATCH 15/25] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 44e5fa790fcf2..fcaf46b1c3d71 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -689,7 +689,7 @@ Performance Improvements - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) - Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) -- Improved performance of :func:`GroupBy.ffill` and :func:`GroupBy.bfill` (:issue:`11296`) +- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`) .. _whatsnew_0230.docs: From 127c71c1afbec3e33391b74e52cf9da77efa569d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 14:28:58 -0800 Subject: [PATCH 16/25] PEP8 fixes --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bf78270f76845..1c67fabdd3375 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4637,7 +4637,7 @@ def _fill(self, direction, limit=None): res = super()._fill(direction, limit=limit) output = collections.OrderedDict( (grp.name, grp.group_index.take(grp.labels)) for grp in - self.grouper.groupings) + self.grouper.groupings) return self._wrap_transformed_output(output).join(res) From 3a23cd6afa3cba4980b5a5c9e4a7028949b12e93 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 17:32:33 -0800 Subject: [PATCH 17/25] Py27 support with super call --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1c67fabdd3375..0f4bb16c01a56 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4634,7 +4634,7 @@ def _apply_to_column_groupbys(self, func): def _fill(self, direction, limit=None): """Overriden method to join grouped columns in output""" - res = super()._fill(direction, limit=limit) + res = super(DataFrameGroupBy, self)._fill(direction, limit=limit) output = collections.OrderedDict( (grp.name, grp.group_index.take(grp.labels)) for grp in self.grouper.groupings) From a363146213cca61d5d823d67c5d6df3eb0098957 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 19 Feb 2018 22:17:54 -0800 Subject: [PATCH 18/25] Fixed LINT issue --- pandas/core/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0f4bb16c01a56..d3e9f88272c71 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1919,7 +1919,6 @@ def shift(self, periods=1, freq=None, axis=0): self.grouper, needs_ngroups=True, nperiods=periods) - @Substitution(name='groupby') @Appender(_doc_template) def head(self, n=5): From fd513c8e9d77e4fd171a39e62ac945a03bf60633 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 20 Feb 2018 10:22:50 -0800 Subject: [PATCH 19/25] Used kwargs to call Cython groupby funcs --- pandas/_libs/groupby_helper.pxi.in | 14 ++++++++------ pandas/core/groupby.py | 13 +++---------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b0dca670eea8d..296106def7c1f 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -907,14 +907,16 @@ def group_cumsum(numeric[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - int ngroups, int periods): + int64_t ngroups, **kwargs): cdef: Py_ssize_t N, i, j, ii - int offset, sign + int offset, sign, periods int64_t lab, idxer, idxer_slot int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) int64_t[:, :] label_indexer + periods = kwargs['periods'] + N, = ( labels).shape if periods < 0: @@ -958,8 +960,7 @@ def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - ndarray[uint8_t] mask, object direction, - int64_t limit): + ndarray[uint8_t] mask, **kwargs): """Indexes how to fill values forwards or backwards within a group Parameters @@ -980,9 +981,10 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, cdef: Py_ssize_t i, N ndarray[int64_t] sorted_labels - int64_t curr_fill_idx=-1 - int64_t idx, filled_vals=0 + int64_t limit, idx, curr_fill_idx=-1, filled_vals=0 + direction = kwargs['direction'] + limit = kwargs['limit'] N = len(out) # Make sure all arrays are the same size diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d3e9f88272c71..044c7d5f31772 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1865,17 +1865,12 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, needs_ngroups : bool, default False Whether number of groups part of the Cython call signature **kwargs : dict - Extra arguments required for the given function. This method - internally stores an OrderedDict that maps those keywords to - positional arguments before calling the Cython layer + Extra arguments to be passed back to Cython funcs Returns ------- GroupBy object populated with appropriate result(s) """ - exp_kwds = collections.OrderedDict([ - (('group_fillna_indexer'), ('direction', 'limit')), - (('group_shift_indexer'), ('nperiods',))]) labels, _, ngroups = grouper.group_info output = collections.OrderedDict() @@ -1891,9 +1886,7 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, if needs_ngroups: func = partial(func, ngroups) - # Convert any keywords into positional arguments - func = partial(func, *(kwargs[x] for x in exp_kwds[how])) - func() # Call func to modify indexer values in place + func(**kwargs) # Call func to modify indexer values in place output[name] = algorithms.take_nd(obj.values, indexer) return self._wrap_transformed_output(output) @@ -1917,7 +1910,7 @@ def shift(self, periods=1, freq=None, axis=0): return self._get_cythonized_result('group_shift_indexer', self.grouper, needs_ngroups=True, - nperiods=periods) + periods=periods) @Substitution(name='groupby') @Appender(_doc_template) From 776d1b7968526a5178ec009b0ec71c0c6f6468e9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 20 Feb 2018 11:01:35 -0800 Subject: [PATCH 20/25] Docstring for _fill method --- pandas/core/groupby.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 044c7d5f31772..191fd92be893e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1458,6 +1458,27 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) def _fill(self, direction, limit=None): + """Shared function for `pad` and `backfill` to call Cython method + + Parameters + ---------- + direction : {'ffill', 'bfill'} + Direction passed to underlying Cython function. `bfill` will cause + values to be filled backwards. `ffill` and any other values will + default to a forward fill + limit : int, default None + Maximum number of consecutive values to fill. If `None`, this + method will convert to -1 prior to passing to Cython + + Returns + ------- + `Series` or `DataFrame` with filled values + + See Also + -------- + pad + backfill + """ # Need int value for Cython if limit is None: limit = -1 @@ -1869,7 +1890,7 @@ def _get_cythonized_result(self, how, grouper, needs_mask=False, Returns ------- - GroupBy object populated with appropriate result(s) + `Series` or `DataFrame` with filled values """ labels, _, ngroups = grouper.group_info From 33f0d06840cbbf12e6836109212b64f8d9f8c889 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 20 Feb 2018 17:36:45 -0800 Subject: [PATCH 21/25] Cleaned up kwargs passing to Cython layer --- pandas/_libs/groupby_helper.pxi.in | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 296106def7c1f..c97db16d9d656 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -907,16 +907,14 @@ def group_cumsum(numeric[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - int64_t ngroups, **kwargs): + int ngroups, int periods): cdef: Py_ssize_t N, i, j, ii - int offset, sign, periods + int offset, sign int64_t lab, idxer, idxer_slot int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) int64_t[:, :] label_indexer - periods = kwargs['periods'] - N, = ( labels).shape if periods < 0: @@ -960,7 +958,8 @@ def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.wraparound(False) @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - ndarray[uint8_t] mask, **kwargs): + ndarray[uint8_t] mask, object direction, + int64_t limit): """Indexes how to fill values forwards or backwards within a group Parameters @@ -981,10 +980,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, cdef: Py_ssize_t i, N ndarray[int64_t] sorted_labels - int64_t limit, idx, curr_fill_idx=-1, filled_vals=0 + int64_t idx, curr_fill_idx=-1, filled_vals=0 - direction = kwargs['direction'] - limit = kwargs['limit'] N = len(out) # Make sure all arrays are the same size From 662008a6adfff8d21c48dbbc759cb511b6697daf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Feb 2018 08:12:09 -0800 Subject: [PATCH 22/25] Idiomatic update - replace join with concat --- pandas/core/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 191fd92be893e..1d7e09048a03a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4652,7 +4652,8 @@ def _fill(self, direction, limit=None): (grp.name, grp.group_index.take(grp.labels)) for grp in self.grouper.groupings) - return self._wrap_transformed_output(output).join(res) + from pandas.core.reshape.concat import concat + return concat((self._wrap_transformed_output(output), res), axis=1) def count(self): """ Compute count of group, excluding missing values """ From 27e24fa36f3781c03dfe179c72f24c62179071ac Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 22 Feb 2018 18:03:13 -0800 Subject: [PATCH 23/25] Moved non-templated funcs to groupby.pyx --- pandas/_libs/groupby.pyx | 216 ++++++++++++++++++++++++++++ pandas/_libs/groupby_helper.pxi.in | 219 ----------------------------- 2 files changed, 216 insertions(+), 219 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 866683ce378ab..e3d208a915225 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -94,5 +94,221 @@ cdef inline float64_t kth_smallest_c(float64_t* a, return a[k] +@cython.boundscheck(False) +@cython.wraparound(False) +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + + assert min_count == -1, "'min_count' only used in add and prod" + + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + with nogil: + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + bint is_datetimelike): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + float64_t[:, :] accum + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(values) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + is_datetimelike): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + numeric[:, :] accum + int64_t lab + + N, K = ( values).shape + accum = np.zeros_like(values) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if numeric == float32_t or numeric == float64_t: + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + else: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + + label_seen[lab] += 1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, + ndarray[uint8_t] mask, object direction, + int64_t limit): + """Indexes how to fill values forwards or backwards within a group + + Parameters + ---------- + out : array of int64_t values which this method will write its results to + Missing values will be written to with a value of -1 + labels : array containing unique label for each group, with its ordering + matching up to the corresponding record in `values` + mask : array of int64_t values where a 1 indicates a missing value + direction : {'ffill', 'bfill'} + Direction for fill to be applied (forwards or backwards, respectively) + limit : Consecutive values to fill before stopping, or -1 for no limit + + Notes + ----- + This method modifies the `out` parameter rather than returning an object + """ + cdef: + Py_ssize_t i, N + ndarray[int64_t] sorted_labels + int64_t idx, curr_fill_idx=-1, filled_vals=0 + + N = len(out) + + # Make sure all arrays are the same size + assert N == len(labels) == len(mask) + + sorted_labels = np.argsort(labels).astype(np.int64, copy=False) + if direction == 'bfill': + sorted_labels = sorted_labels[::-1] + + with nogil: + for i in range(N): + idx = sorted_labels[i] + if mask[idx] == 1: # is missing + # Stop filling once we've hit the limit + if filled_vals >= limit and limit != -1: + curr_fill_idx = -1 + filled_vals += 1 + else: # reset items when not missing + filled_vals = 0 + curr_fill_idx = idx + + out[idx] = curr_fill_idx + + # If we move to the next group, reset + # the fill_idx and counter + if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: + curr_fill_idx = -1 + filled_vals = 0 + + # generated from template include "groupby_helper.pxi" diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index c97db16d9d656..de802f4a72277 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -791,222 +791,3 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = mval {{endfor}} - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -#---------------------------------------------------------------------- - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - - assert min_count == -1, "'min_count' only used in add and prod" - - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - with nogil: - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:, :] out, - float64_t[:, :] values, - int64_t[:] labels, - bint is_datetimelike): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - float64_t[:, :] accum - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(values) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:, :] out, - numeric[:, :] values, - int64_t[:] labels, - is_datetimelike): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - numeric[:, :] accum - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(values) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if numeric == float32_t or numeric == float64_t: - if val == val: - accum[lab, j] += val - out[i, j] = accum[lab, j] - else: - accum[lab, j] += val - out[i, j] = accum[lab, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:, :] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - - # Skip null keys - if lab == -1: - out[ii] = -1 - continue - - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, - ndarray[uint8_t] mask, object direction, - int64_t limit): - """Indexes how to fill values forwards or backwards within a group - - Parameters - ---------- - out : array of int64_t values which this method will write its results to - Missing values will be written to with a value of -1 - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - mask : array of int64_t values where a 1 indicates a missing value - direction : {'ffill', 'bfill'} - Direction for fill to be applied (forwards or backwards, respectively) - limit : Consecutive values to fill before stopping, or -1 for no limit - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - Py_ssize_t i, N - ndarray[int64_t] sorted_labels - int64_t idx, curr_fill_idx=-1, filled_vals=0 - - N = len(out) - - # Make sure all arrays are the same size - assert N == len(labels) == len(mask) - - sorted_labels = np.argsort(labels).astype(np.int64, copy=False) - if direction == 'bfill': - sorted_labels = sorted_labels[::-1] - - with nogil: - for i in range(N): - idx = sorted_labels[i] - if mask[idx] == 1: # is missing - # Stop filling once we've hit the limit - if filled_vals >= limit and limit != -1: - curr_fill_idx = -1 - filled_vals += 1 - else: # reset items when not missing - filled_vals = 0 - curr_fill_idx = idx - - out[idx] = curr_fill_idx - - # If we move to the next group, reset - # the fill_idx and counter - if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: - curr_fill_idx = -1 - filled_vals = 0 From 6f72476674da2304ad4f8ba684a911c52936c559 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 23 Feb 2018 07:42:41 -0800 Subject: [PATCH 24/25] Code update - swap group_index.take with grouper --- pandas/core/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1d7e09048a03a..bd163b53b17be 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4649,8 +4649,7 @@ def _fill(self, direction, limit=None): """Overriden method to join grouped columns in output""" res = super(DataFrameGroupBy, self)._fill(direction, limit=limit) output = collections.OrderedDict( - (grp.name, grp.group_index.take(grp.labels)) for grp in - self.grouper.groupings) + (grp.name, grp.grouper) for grp in self.grouper.groupings) from pandas.core.reshape.concat import concat return concat((self._wrap_transformed_output(output), res), axis=1) From eff660370cceb2c26fef207ae444be9497d063a1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 24 Feb 2018 08:33:52 -0800 Subject: [PATCH 25/25] Rebase and update import --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bd163b53b17be..852ad04cd8a2e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4651,7 +4651,7 @@ def _fill(self, direction, limit=None): output = collections.OrderedDict( (grp.name, grp.grouper) for grp in self.grouper.groupings) - from pandas.core.reshape.concat import concat + from pandas import concat return concat((self._wrap_transformed_output(output), res), axis=1) def count(self):