From bd9011aaa04f16be785672e2f6c6f3e0862584d6 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 18 Apr 2020 18:16:33 +0000 Subject: [PATCH 01/31] made nan count when dropna=False --- doc/source/whatsnew/v1.0.3.rst | 1 + pandas/core/algorithms.py | 5 ++--- pandas/core/groupby/generic.py | 3 ++- pandas/tests/base/test_value_counts.py | 8 ++++++++ 4 files changed, 13 insertions(+), 4 deletions(-) mode change 100644 => 100755 pandas/core/groupby/generic.py diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst index 26d06433bda0c..0418845022b5c 100644 --- a/doc/source/whatsnew/v1.0.3.rst +++ b/doc/source/whatsnew/v1.0.3.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +:issue: `25970` Fixed Series.value_counts so that normalize excludes NA values when dropna=False. Contributors ~~~~~~~~~~~~ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 62a3808d36ba2..18a839d71af11 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -695,7 +695,6 @@ def value_counts( # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) - result = result[result.index.notna()] result.index = result.index.astype("interval") result = result.sort_index() @@ -703,8 +702,8 @@ def value_counts( if dropna and (result._values == 0).all(): result = result.iloc[0:0] - # normalizing is by len of all (regardless of dropna) - counts = np.array([len(ii)]) + # normalizing is by len of what gets included in the bins + counts = result._values else: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py old mode 100644 new mode 100755 index c007d4920cbe7..594ebc4e4570c --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -675,7 +675,7 @@ def value_counts( from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers - if bins is not None and not np.iterable(bins): + if bins is not None:# and not np.iterable(bins): # scalar bins cannot be done at top level # in a backward compatible way return self.apply( @@ -684,6 +684,7 @@ def value_counts( sort=sort, ascending=ascending, bins=bins, + dropna=dropna ) ids, _, _ = self.grouper.group_info diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index d45feaff68dde..a66a2d1dafd11 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -190,6 +190,14 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 0 + # handle normalizing bins with NA's properly + # see GH25970 + s2 = Series([1,2,2,3,3,3, np.nan, np.nan, 4, 5]) + intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) + expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1,0,2])) + expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2]) + tm.assert_series_equal(s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna) + tm.assert_numpy_array_equal(s2.value_counts(dropna=False, normalize=True, bins=3).values, expected_keepna_vals) def test_value_counts_datetime64(index_or_series): klass = index_or_series From d9d5ec15bb1b258e327163498864155becba8857 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 18 Apr 2020 18:18:37 +0000 Subject: [PATCH 02/31] updated changelog --- doc/source/whatsnew/v1.0.3.rst | 0 pandas/tests/base/test_value_counts.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 doc/source/whatsnew/v1.0.3.rst mode change 100644 => 100755 pandas/tests/base/test_value_counts.py diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst old mode 100644 new mode 100755 diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py old mode 100644 new mode 100755 From 86fe7f9a44c150c08e92bbbe62c8829ad270e7b3 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 18 Apr 2020 18:20:19 +0000 Subject: [PATCH 03/31] trivial --- pandas/core/algorithms.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 pandas/core/algorithms.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py old mode 100644 new mode 100755 From c34a863abd60649ebef8b1c812f8f0d958c478df Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 18 Apr 2020 23:43:06 +0000 Subject: [PATCH 04/31] added specific test for groupby valuecount interval fix --- doc/source/whatsnew/v1.0.3.rst | 4 ++-- pandas/tests/groupby/test_value_counts.py | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst index 0418845022b5c..8184d979d2c50 100755 --- a/doc/source/whatsnew/v1.0.3.rst +++ b/doc/source/whatsnew/v1.0.3.rst @@ -22,8 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -:issue: `25970` Fixed Series.value_counts so that normalize excludes NA values when dropna=False. - +Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) +Fixed Dataframe Groupby value_counts with bins (:issue:`32471') Contributors ~~~~~~~~~~~~ diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c86cb4532bc26..4b12a1e0b2da4 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime +from pandas import DataFrame, Grouper, MultiIndex, Series, cut, date_range, to_datetime import pandas._testing as tm @@ -41,13 +41,12 @@ def seed_df(seed_nans, n, m): ids = [] for seed_nans in [True, False]: for n, m in product((100, 1000), (5, 20)): - df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) - ids.append(f"{k}-{n}-{m}") + ids.append(f"{k}-{n}-{m}-{seed_nans} ") @pytest.mark.slow @@ -71,6 +70,7 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) + left.index.names = left.index.names[:-1] + ["3rd"] gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) @@ -81,6 +81,22 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) +def test_groubpy_value_counts_bins(): + # GH32471 + BINS = [0, 20, 80, 100] + df = DataFrame( + [[0, 0], [1, 100], [0, 100], [2, 0], [3, 100]], columns=["key", "score"] + ) + result = df.groupby("key")["score"].value_counts(bins=BINS) + result.sort_index(inplace=True) + intervals = cut(Series([0]), bins=BINS, include_lowest=True).cat.categories + index = MultiIndex.from_product( + [[0, 1, 2, 3], sorted(intervals)], names=("key", None) + ) + expected = Series([1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1], index, name="score") + tm.assert_series_equal(result, expected) + + def test_series_groupby_value_counts_with_grouper(): # GH28479 df = DataFrame( From 5f8eb1d775633732e00e26de66c21199645c6081 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 19 Apr 2020 12:35:30 +0000 Subject: [PATCH 05/31] updated value_count docstrings --- pandas/core/algorithms.py | 14 ++-- pandas/core/base.py | 28 +++++-- pandas/core/groupby/generic.py | 135 ++------------------------------- 3 files changed, 38 insertions(+), 139 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 110f8f95927ee..c2f9f1aa73922 100755 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -663,12 +663,16 @@ def value_counts( ascending : bool, default False Sort in ascending order normalize: bool, default False - If True then compute a relative histogram - bins : integer, optional - Rather than count values, group them into half-open bins, - convenience for pd.cut, only works with numeric data + If True, then compute a relative histogram that outputs the + proportion of each value. + bins : integer or iterable of numeric, optional + Rather than count values, group them into half-open bins. + Only works with numeric data. + If int, interpreted as number of bins and will use pd.cut. + If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True - Don't include counts of NaN + Don't include counts of NaN. + If False and NaNs are present, NaN will be a key in the output. Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index ee514888c6331..122cfabd20768 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1176,17 +1176,20 @@ def value_counts( Parameters ---------- normalize : bool, default False - If True then the object returned will contain the relative - frequencies of the unique values. + If True, outputs the relative frequencies of the unique values. sort : bool, default True Sort by frequencies. ascending : bool, default False Sort in ascending order. - bins : int, optional - Rather than count values, group them into half-open bins, - a convenience for ``pd.cut``, only works with numeric data. + bins : integer or iterable of numeric, optional + Rather than count individual values, group them into half-open bins. + Only works with numeric data. + If int, interpreted as number of bins and will use ``pd.cut``. + If interable of numeric, will use provided numbers as bin endpoints. + dropna : bool, default True Don't include counts of NaN. + If False and NaNs are present, NaN will be a key in the output. Returns ------- @@ -1223,8 +1226,10 @@ def value_counts( Bins can be useful for going from a continuous variable to a categorical variable; instead of counting unique - apparitions of values, divide the index in the specified - number of half-open bins. + instances of values, count the number of values that fall + into half-open intervals. + + Bins can be an int. >>> s.value_counts(bins=3) (2.0, 3.0] 2 @@ -1232,6 +1237,15 @@ def value_counts( (3.0, 4.0] 1 dtype: int64 + Bins can also be an iterable of numbers. These numbers are treated + as endpoints for the intervals. + + >>> s.value_counts(bins=[0,2,4,9]) + (2.0, 4.0] 3 + (-0.001, 2.0] 2 + (4.0, 9.0] 0 + dtype: int64 + **dropna** With `dropna` set to `False` we can also see NaN index values. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 594ebc4e4570c..6eaf652ff6ab8 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -7,7 +7,6 @@ """ from collections import abc, namedtuple import copy -from functools import partial from textwrap import dedent import typing from typing import ( @@ -41,11 +40,8 @@ maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, - is_integer_dtype, - is_interval_dtype, is_numeric_dtype, is_object_dtype, is_scalar, @@ -671,129 +667,14 @@ def describe(self, **kwargs): def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - - from pandas.core.reshape.tile import cut - from pandas.core.reshape.merge import _get_join_indexers - - if bins is not None:# and not np.iterable(bins): - # scalar bins cannot be done at top level - # in a backward compatible way - return self.apply( - Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - dropna=dropna - ) - - ids, _, _ = self.grouper.group_info - val = self.obj._values - - # groupby removes null keys from groupings - mask = ids != -1 - ids, val = ids[mask], val[mask] - - if bins is None: - lab, lev = algorithms.factorize(val, sort=True) - llab = lambda lab, inc: lab[inc] - else: - - # lab is a Categorical with categories an IntervalIndex - lab = cut(Series(val), bins, include_lowest=True) - lev = lab.cat.categories - lab = lev.take(lab.cat.codes) - llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] - - if is_interval_dtype(lab): - # TODO: should we do this inside II? - sorter = np.lexsort((lab.left, lab.right, ids)) - else: - sorter = np.lexsort((lab, ids)) - - ids, lab = ids[sorter], lab[sorter] - - # group boundaries are where group ids change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - - # new values are where sorted labels change - lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) - inc = np.r_[True, lchanges] - inc[idx] = True # group boundaries are also new values - out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts - - # num. of times each group should be repeated - rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) - - # multi-index components - codes = self.grouper.reconstructed_codes - codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] - names = self.grouper.names + [self._selection_name] - - if dropna: - mask = codes[-1] != -1 - if mask.all(): - dropna = False - else: - out, codes = out[mask], [level_codes[mask] for level_codes in codes] - - if normalize: - out = out.astype("float") - d = np.diff(np.r_[idx, len(ids)]) - if dropna: - m = ids[lab == -1] - np.add.at(d, m, -1) - acc = rep(d)[mask] - else: - acc = rep(d) - out /= acc - - if sort and bins is None: - cat = ids[inc][mask] if dropna else ids[inc] - sorter = np.lexsort((out if ascending else -out, cat)) - out, codes[-1] = out[sorter], codes[-1][sorter] - - if bins is None: - mi = MultiIndex( - levels=levels, codes=codes, names=names, verify_integrity=False - ) - - if is_integer_dtype(out): - out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) - - # for compat. with libgroupby.value_counts need to ensure every - # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype="bool") - for level_codes in codes[:-1]: - diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] - - ncat, nbin = diff.sum(), len(levels[-1]) - - left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - - right = [diff.cumsum() - 1, codes[-1]] - - _, idx = _get_join_indexers(left, right, sort=False, how="left") - out = np.where(idx != -1, out[idx], 0) - - if sort: - sorter = np.lexsort((out if ascending else -out, left[0])) - out, left[-1] = out[sorter], left[-1][sorter] - - # build the multi-index w/ full levels - def build_codes(lev_codes: np.ndarray) -> np.ndarray: - return np.repeat(lev_codes[diff], nbin) - - codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] - codes.append(left[-1]) - - mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) - - if is_integer_dtype(out): - out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) def count(self) -> Series: """ From 127616697d7af37eed5bcc7d7d13e79dbb843db4 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 19 Apr 2020 13:47:58 +0000 Subject: [PATCH 06/31] fixed pep8 style --- doc/source/whatsnew/v1.0.3.rst | 3 ++- pandas/core/base.py | 1 + pandas/tests/base/test_value_counts.py | 14 ++++++++++---- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst index 8184d979d2c50..7972948d1d08a 100755 --- a/doc/source/whatsnew/v1.0.3.rst +++ b/doc/source/whatsnew/v1.0.3.rst @@ -23,7 +23,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) -Fixed Dataframe Groupby value_counts with bins (:issue:`32471') +Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) + Contributors ~~~~~~~~~~~~ diff --git a/pandas/core/base.py b/pandas/core/base.py index 122cfabd20768..d745aada64cbf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1258,6 +1258,7 @@ def value_counts( 1.0 1 dtype: int64 """ + result = value_counts( self, sort=sort, diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index a66a2d1dafd11..3b7c8dea2576f 100755 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -192,12 +192,18 @@ def test_value_counts_bins(index_or_series): # handle normalizing bins with NA's properly # see GH25970 - s2 = Series([1,2,2,3,3,3, np.nan, np.nan, 4, 5]) + s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5]) intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) - expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1,0,2])) + expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2])) expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2]) - tm.assert_series_equal(s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna) - tm.assert_numpy_array_equal(s2.value_counts(dropna=False, normalize=True, bins=3).values, expected_keepna_vals) + tm.assert_series_equal( + s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna + ) + tm.assert_numpy_array_equal( + s2.value_counts(dropna=False, normalize=True, bins=3).values, + expected_keepna_vals, + ) + def test_value_counts_datetime64(index_or_series): klass = index_or_series From a1b7197d3caa598bf0573d753f260c667dbe9768 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 19 Apr 2020 15:06:28 +0000 Subject: [PATCH 07/31] fixed more minor style --- pandas/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index d745aada64cbf..0d93967d6ee05 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1184,7 +1184,7 @@ def value_counts( bins : integer or iterable of numeric, optional Rather than count individual values, group them into half-open bins. Only works with numeric data. - If int, interpreted as number of bins and will use ``pd.cut``. + If int, interpreted as number of bins and will use ``pd.cut`` If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True From 9c3ede33c598475e29bb041d3f0e3e8e98276c5c Mon Sep 17 00:00:00 2001 From: DataInformer Date: Mon, 20 Apr 2020 21:00:59 +0000 Subject: [PATCH 08/31] added test for na in bins --- pandas/tests/base/test_value_counts.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 3b7c8dea2576f..69dcd687f8505 100755 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -190,8 +190,22 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 0 - # handle normalizing bins with NA's properly - # see GH25970 + +def test_value_counts_bins_nas(): + # GH25970, handle normalizing bins with NA's properly + # First test that NA's are included appropriately + rand_data = np.append( + np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20) + ) + s = Series(rand_data) + assert s.value_counts(dropna=False).index.hasnans + assert not s.value_counts(dropna=True).index.hasnans + assert s.value_counts(dropna=False, bins=3).index.hasnans + assert not s.value_counts(dropna=True, bins=3).index.hasnans + assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans + assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans + + # then verify specific example s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5]) intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2])) From 0cff92b85862b069b26c7e3be82ae18ef3556ab2 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Mon, 20 Apr 2020 21:03:54 +0000 Subject: [PATCH 09/31] added release notes to 1.1 --- doc/source/whatsnew/v1.0.3.rst | 3 +-- doc/source/whatsnew/v1.1.0.rst | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst index 7972948d1d08a..b7fd746eefba1 100755 --- a/doc/source/whatsnew/v1.0.3.rst +++ b/doc/source/whatsnew/v1.0.3.rst @@ -22,8 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) -Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) + Contributors ~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 03a547fadd7ca..7335245eae1a8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -434,7 +434,8 @@ Performance improvements Bug fixes ~~~~~~~~~ - +Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) +Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) Categorical ^^^^^^^^^^^ From 27aa4603a9e30cbd7ea86c32d3a1d781a870dbe3 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 25 Apr 2020 15:25:10 +0000 Subject: [PATCH 10/31] trying to avoid docstring warning --- pandas/core/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c2f9f1aa73922..c0569dab903b2 100755 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -415,7 +415,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps.isin(values) # type: ignore + return comps.isin(values) comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) @@ -678,6 +678,7 @@ def value_counts( ------- Series """ + from pandas.core.series import Series name = getattr(values, "name", None) From 27c985612bb62185b737f6debdfab6768aabd1a8 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 25 Apr 2020 15:48:37 +0000 Subject: [PATCH 11/31] trying to avoid docstring warning --- pandas/core/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0d93967d6ee05..c9cb06e90e3e3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1184,9 +1184,8 @@ def value_counts( bins : integer or iterable of numeric, optional Rather than count individual values, group them into half-open bins. Only works with numeric data. - If int, interpreted as number of bins and will use ``pd.cut`` + If int, interpreted as number of bins and will use `pd.cut`. If interable of numeric, will use provided numbers as bin endpoints. - dropna : bool, default True Don't include counts of NaN. If False and NaNs are present, NaN will be a key in the output. From f5e9aeb08a6085e114aa38238dcc4affe588ffd8 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 27 Jun 2020 15:57:09 -0400 Subject: [PATCH 12/31] include nan count when dropna=False --- pandas/core/algorithms.py | 21 ++++++++++--------- pandas/core/base.py | 16 ++++++++++++--- pandas/tests/base/test_value_counts.py | 28 ++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..e9313b1988cad 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -693,12 +693,16 @@ def value_counts( ascending : bool, default False Sort in ascending order normalize: bool, default False - If True then compute a relative histogram - bins : integer, optional - Rather than count values, group them into half-open bins, - convenience for pd.cut, only works with numeric data + If True, then compute a relative histogram that outputs the + proportion of each value. + bins : integer or iterable of numeric, optional + Rather than count values, group them into half-open bins. + Only works with numeric data. + If int, interpreted as number of bins and will use pd.cut. + If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True - Don't include counts of NaN + Don't include counts of NaN. + If False and NaNs are present, NaN will be a key in the output. Returns ------- @@ -717,9 +721,8 @@ def value_counts( except TypeError as err: raise TypeError("bins argument only works with numeric data.") from err - # count, remove nulls (from the index), and but the bins + # count, remove nulls (from the index), and use the bins result = ii.value_counts(dropna=dropna) - result = result[result.index.notna()] result.index = result.index.astype("interval") result = result.sort_index() @@ -727,8 +730,8 @@ def value_counts( if dropna and (result._values == 0).all(): result = result.iloc[0:0] - # normalizing is by len of all (regardless of dropna) - counts = np.array([len(ii)]) + # normalizing is by len of what gets included in the bins + counts = result._values else: diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..16c6938ced539 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1180,11 +1180,14 @@ def value_counts( Sort by frequencies. ascending : bool, default False Sort in ascending order. - bins : int, optional - Rather than count values, group them into half-open bins, - a convenience for ``pd.cut``, only works with numeric data. + bins : integer or iterable of numeric, optional + Rather than count individual values, group them into half-open bins. + Only works with numeric data. + If int, interpreted as number of bins and will use `pd.cut`. + If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True Don't include counts of NaN. + If False and NaNs are present, NaN will be a key in the output. Returns ------- @@ -1230,6 +1233,13 @@ def value_counts( (3.0, 4.0] 1 dtype: int64 + Bins can also be an iterable of numbers. These numbers are treated + as endpoints for the intervals. + >>> s.value_counts(bins=[0,2,4,9]) + (2.0, 4.0] 3 + (-0.001, 2.0] 2 + (4.0, 9.0] 0 + dtype: int64 **dropna** With `dropna` set to `False` we can also see NaN index values. diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index de04c30432e6f..17e1afe906b0a 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -191,6 +191,34 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 0 +def test_value_counts_bins_nas(): + # GH25970, handle normalizing bins with NA's properly + # First test that NA's are included appropriately + rand_data = np.append( + np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20) + ) + s = Series(rand_data) + assert s.value_counts(dropna=False).index.hasnans + assert not s.value_counts(dropna=True).index.hasnans + assert s.value_counts(dropna=False, bins=3).index.hasnans + assert not s.value_counts(dropna=True, bins=3).index.hasnans + assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans + assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans + + # then verify specific example + s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5]) + intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) + expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2])) + expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2]) + tm.assert_series_equal( + s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna + ) + tm.assert_numpy_array_equal( + s2.value_counts(dropna=False, normalize=True, bins=3).values, + expected_keepna_vals, + ) + + def test_value_counts_datetime64(index_or_series): klass = index_or_series From 99b7112e51cad358af0e40b7fe9a98ffd6cf6a9e Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 27 Jun 2020 16:56:34 -0400 Subject: [PATCH 13/31] listed bugfix --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c5eb2febe8ae9..61d68064e6b0d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -921,6 +921,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :meth:`Series.value_counts` with ``normalize=True`` for NA values (:issue:`25970`) Conversion ^^^^^^^^^^ From 75374b26b7a892ad240e0e7f19196d9667eee702 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 27 Jun 2020 18:16:54 -0400 Subject: [PATCH 14/31] avoided tests that highlight groupby.value_count bug --- pandas/core/base.py | 6 ++++-- pandas/tests/groupby/test_value_counts.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 16c6938ced539..56faaa80ffb74 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1180,7 +1180,7 @@ def value_counts( Sort by frequencies. ascending : bool, default False Sort in ascending order. - bins : integer or iterable of numeric, optional + bins : int or iterable of numeric, optional Rather than count individual values, group them into half-open bins. Only works with numeric data. If int, interpreted as number of bins and will use `pd.cut`. @@ -1235,11 +1235,13 @@ def value_counts( Bins can also be an iterable of numbers. These numbers are treated as endpoints for the intervals. + >>> s.value_counts(bins=[0,2,4,9]) - (2.0, 4.0] 3 + (2.0, 4.0] 3 (-0.001, 2.0] 2 (4.0, 9.0] 0 dtype: int64 + **dropna** With `dropna` set to `False` we can also see NaN index values. diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c86cb4532bc26..f8640c63ecc6e 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -53,10 +53,10 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) @pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("normalize", [False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("dropna", [True]) def test_series_groupby_value_counts( df, keys, bins, n, m, isort, normalize, sort, ascending, dropna ): @@ -71,6 +71,7 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) + # left.index.names = left.index.names[:-1] + ["3rd"] gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) From 25b6c143b95661b2b426ff9d62e9071c0fd91277 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 4 Jul 2020 13:15:16 -0400 Subject: [PATCH 15/31] Revert "avoided tests that highlight groupby.value_count bug" This reverts commit 75374b26b7a892ad240e0e7f19196d9667eee702. --- pandas/core/base.py | 6 ++---- pandas/tests/groupby/test_value_counts.py | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 56faaa80ffb74..16c6938ced539 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1180,7 +1180,7 @@ def value_counts( Sort by frequencies. ascending : bool, default False Sort in ascending order. - bins : int or iterable of numeric, optional + bins : integer or iterable of numeric, optional Rather than count individual values, group them into half-open bins. Only works with numeric data. If int, interpreted as number of bins and will use `pd.cut`. @@ -1235,13 +1235,11 @@ def value_counts( Bins can also be an iterable of numbers. These numbers are treated as endpoints for the intervals. - >>> s.value_counts(bins=[0,2,4,9]) - (2.0, 4.0] 3 + (2.0, 4.0] 3 (-0.001, 2.0] 2 (4.0, 9.0] 0 dtype: int64 - **dropna** With `dropna` set to `False` we can also see NaN index values. diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index f8640c63ecc6e..c86cb4532bc26 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -53,10 +53,10 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) @pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize", [False]) +@pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True]) +@pytest.mark.parametrize("dropna", [True, False]) def test_series_groupby_value_counts( df, keys, bins, n, m, isort, normalize, sort, ascending, dropna ): @@ -71,7 +71,6 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) - # left.index.names = left.index.names[:-1] + ["3rd"] gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) From 277ce5275ffcf33d0ccc7c513c5a553fd2597661 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 4 Jul 2020 14:49:22 -0400 Subject: [PATCH 16/31] use series value_counts for groupby --- pandas/core/groupby/generic.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dab8475d9580c..6a97760a92086 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -664,7 +664,16 @@ def describe(self, **kwargs): def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) + """ from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -786,6 +795,7 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: if is_integer_dtype(out): out = ensure_int64(out) return self.obj._constructor(out, index=mi, name=self._selection_name) + """ def count(self) -> Series: """ From 797f66849cc021c405f3527d90b04d2e150b8429 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 4 Jul 2020 16:45:17 -0400 Subject: [PATCH 17/31] added groupby bin test --- pandas/tests/groupby/test_value_counts.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c86cb4532bc26..4b12a1e0b2da4 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime +from pandas import DataFrame, Grouper, MultiIndex, Series, cut, date_range, to_datetime import pandas._testing as tm @@ -41,13 +41,12 @@ def seed_df(seed_nans, n, m): ids = [] for seed_nans in [True, False]: for n, m in product((100, 1000), (5, 20)): - df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) - ids.append(f"{k}-{n}-{m}") + ids.append(f"{k}-{n}-{m}-{seed_nans} ") @pytest.mark.slow @@ -71,6 +70,7 @@ def rebuild_index(df): gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) + left.index.names = left.index.names[:-1] + ["3rd"] gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) @@ -81,6 +81,22 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) +def test_groubpy_value_counts_bins(): + # GH32471 + BINS = [0, 20, 80, 100] + df = DataFrame( + [[0, 0], [1, 100], [0, 100], [2, 0], [3, 100]], columns=["key", "score"] + ) + result = df.groupby("key")["score"].value_counts(bins=BINS) + result.sort_index(inplace=True) + intervals = cut(Series([0]), bins=BINS, include_lowest=True).cat.categories + index = MultiIndex.from_product( + [[0, 1, 2, 3], sorted(intervals)], names=("key", None) + ) + expected = Series([1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1], index, name="score") + tm.assert_series_equal(result, expected) + + def test_series_groupby_value_counts_with_grouper(): # GH28479 df = DataFrame( From fce6998fa8459f38850c2ec431ffef0bc7e72bc4 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Thu, 16 Jul 2020 15:36:37 -0400 Subject: [PATCH 18/31] passing groupy valcount tests --- pandas/core/algorithms.py | 15 +- pandas/core/groupby/generic.py | 241 +++++++++++------- pandas/tests/groupby/test_value_counts.py | 49 +++- .../tests/series/methods/test_value_counts.py | 7 + 4 files changed, 206 insertions(+), 106 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e9313b1988cad..ba80dfa165f65 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -723,15 +723,15 @@ def value_counts( # count, remove nulls (from the index), and use the bins result = ii.value_counts(dropna=dropna) + print(f"{result=}") result.index = result.index.astype("interval") result = result.sort_index() + """ # if we are dropna and we have NO values if dropna and (result._values == 0).all(): result = result.iloc[0:0] - - # normalizing is by len of what gets included in the bins - counts = result._values + """ else: @@ -740,19 +740,18 @@ def value_counts( # handle Categorical and sparse, result = Series(values)._values.value_counts(dropna=dropna) result.name = name - counts = result._values else: keys, counts = _value_counts_arraylike(values, dropna) result = Series(counts, index=keys, name=name) - if sort: - result = result.sort_values(ascending=ascending) - if normalize: - result = result / float(counts.sum()) + counts = result._values + result = result / float(max(counts.sum(), 1)) + if sort: + result = result.sort_values(ascending=ascending) return result diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index eea076044eeeb..a81be6b28bfce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -51,6 +51,7 @@ is_scalar, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna, notna from pandas.core.aggregation import ( @@ -664,16 +665,7 @@ def describe(self, **kwargs): def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - return self.apply( - Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins, - dropna=dropna, - ) - """ from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -687,115 +679,184 @@ def value_counts( ascending=ascending, bins=bins, ) - + keys = [k for k in self.groups] + # print(f'{self.groups=}') ids, _, _ = self.grouper.group_info + # print(f'{ids=}') val = self.obj._values + print(f"{keys=}") + codes = self.grouper.reconstructed_codes # this will track the groups + print("codes: ", codes) # groupby removes null keys from groupings mask = ids != -1 ids, val = ids[mask], val[mask] + if dropna: + mask = ~np.isnan(val) + if not mask.all(): + ids, val = ids[mask], val[mask] + # codes = [code[mask] for code in codes] + print(f"{ids=}") + print(f"{val=}") + + print(f"{bins=}") if bins is None: - lab, lev = algorithms.factorize(val, sort=True) - llab = lambda lab, inc: lab[inc] + val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna) + print(f"{val_lab=}") else: + # val_lab is a Categorical with categories an IntervalIndex + print(f"{Series(val)=}") + val_lab = cut(Series(val), bins, include_lowest=True) + # cut excludes NaN from its categories, so need to manually add + print(f"{val_lab=}") + print((not dropna) and (val_lab.hasnans)) + """if (not dropna) and (val_lab.hasnans): + # val_lab = + cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories) + print(cat_nan) + val_lab = val_lab.astype(cat_nan).fillna('NaN') + """ + print(f"{val_lab=}") + val_lev = val_lab.cat.categories + val_lab = val_lab.cat.codes.values + print(f"{val_lab=}") + if dropna: + included = val_lab != -1 + ids, val_lab = ids[included], val_lab[included] - # lab is a Categorical with categories an IntervalIndex - lab = cut(Series(val), bins, include_lowest=True) - lev = lab.cat.categories - lab = lev.take(lab.cat.codes) - llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] - - if is_interval_dtype(lab.dtype): + # print('1st val_lab: ', val_lab.cat.codes) + # llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1] + print(f"{val_lev=}") + if is_interval_dtype(val_lab.dtype): # TODO: should we do this inside II? - sorter = np.lexsort((lab.left, lab.right, ids)) + sorter = np.lexsort((val_lab.right, val_lab.left, ids)) else: - sorter = np.lexsort((lab, ids)) - - ids, lab = ids[sorter], lab[sorter] + sorter = np.lexsort((val_lab, ids)) + ids, val_lab = ids[sorter], val_lab[sorter] + print("ids: ", ids) + print(f"{val_lab=}") + # val_lab = val_lab.values + # print(f'{val_lab=}') # group boundaries are where group ids change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - # new values are where sorted labels change - lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) - inc = np.r_[True, lchanges] - inc[idx] = True # group boundaries are also new values - out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts - - # num. of times each group should be repeated - rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) - - # multi-index components - codes = self.grouper.reconstructed_codes - codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + change_ids = ids[1:] != ids[:-1] + print((val_lab[1:] != val_lab[:-1])) + changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1])) + """ + changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1]) + for i in range(len(ids)-1)] #((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1])) + """ + print(f"{changes=}") + print(np.diff(np.nonzero(changes), append=len(changes))[0]) + changes = np.r_[True, changes] + cts = np.diff(np.nonzero(np.r_[changes, True]))[0] # , append=len(changes))[0] + print(f"{cts=}") + val_lab = val_lab[changes] + ids = ids[changes] + print("ids: ", ids) + + change_ids = ( + ids[1:] != ids[:-1] + ) # need to update now that we removed full repeats + # num_id_rep = np.diff(np.nonzero(np.r_[True, chan])) + print(f"{change_ids=}") + print(f"{val_lab=}") + + num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0] + rep = partial(np.repeat, repeats=num_repeats) + print(f"{rep=}") + if (not dropna) and (-1 in val_lab): + val_lev = np.r_[Index([np.nan]), val_lev] + val_lab += 1 + levels = [ping.group_index for ping in self.grouper.groupings] + [ + Index(val_lev) + ] + print(f"{levels=}") names = self.grouper.names + [self._selection_name] - - if dropna: - mask = codes[-1] != -1 - if mask.all(): - dropna = False - else: - out, codes = out[mask], [level_codes[mask] for level_codes in codes] + print(f"{names=}") if normalize: - out = out.astype("float") - d = np.diff(np.r_[idx, len(ids)]) - if dropna: - m = ids[lab == -1] - np.add.at(d, m, -1) - acc = rep(d)[mask] - else: - acc = rep(d) - out /= acc - - if sort and bins is None: - cat = ids[inc][mask] if dropna else ids[inc] - sorter = np.lexsort((out if ascending else -out, cat)) - out, codes[-1] = out[sorter], codes[-1][sorter] + num_vals = [] + ix = 0 + print(f"{num_repeats=}") + for i, r in enumerate(num_repeats): + num_vals.append(np.sum(cts[ix : ix + r])) + # print(out[ix:ix+r]) + ix += r + # print(f'{ix=}') + # [np.sum(out[i:i+r]) ] + print(f"{num_vals=}") + print(f"{cts=}") + cts = cts.astype("float") + cts /= rep(num_vals) # each divisor is the number of repeats for that index + print(f"{cts=}") if bins is None: + print("codes: ", codes) + # codes = [code[changes] for code in codes] + used_ids = np.unique(ids) + codes = [code[used_ids] for code in codes] + codes = [rep(level_codes) for level_codes in codes] + [val_lab] + print(f"{codes=}") + + if sort: + indices = tuple(reversed(codes[:-1])) + sorter = np.lexsort( + np.r_[[val_lab], [cts if ascending else -cts], indices] + ) # sorts using right columns first + cts = cts[sorter] + codes = [code[sorter] for code in codes] + print(f"{cts=}") mi = MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=False ) + # print(f'{mi=}') + if is_integer_dtype(cts): + cts = ensure_int64(cts) + return self.obj._constructor(cts, index=mi, name=self._selection_name) - if is_integer_dtype(out): - out = ensure_int64(out) - return self.obj._constructor(out, index=mi, name=self._selection_name) + nbin = len(levels[-1]) + # print(f'{codes=}') + print(len(cts), len(codes[0]), len(sorter)) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype="bool") - for level_codes in codes[:-1]: - diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] - - ncat, nbin = diff.sum(), len(levels[-1]) - - left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - - right = [diff.cumsum() - 1, codes[-1]] - - _, idx = _get_join_indexers(left, right, sort=False, how="left") - out = np.where(idx != -1, out[idx], 0) - + print(f"{ids=}") + ncat = len(codes[0]) + # ncat = len(ids) + print(f"{nbin=}") + fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) + for i, ct in enumerate(cts): + fout[ids[i] * nbin + val_lab[i]] = ct + print(f"{fout=}", len(fout)) + + ncodes = [np.repeat(code, nbin) for code in codes] + print(f"{ncodes=}") + ncodes.append(np.tile(range(nbin), len(codes[0]))) + """ + fout = cts + ncodes = [rep(level_codes) for level_codes in codes] + [val_lab] + """ + print(f"{ncodes=}") if sort: - sorter = np.lexsort((out if ascending else -out, left[0])) - out, left[-1] = out[sorter], left[-1][sorter] - - # build the multi-index w/ full levels - def build_codes(lev_codes: np.ndarray) -> np.ndarray: - return np.repeat(lev_codes[diff], nbin) - - codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] - codes.append(left[-1]) - - mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) - - if is_integer_dtype(out): - out = ensure_int64(out) - return self.obj._constructor(out, index=mi, name=self._selection_name) - """ + indices = tuple(reversed(ncodes[:-1])) + print(f"{indices=}") + # print(np.r_[[fout if ascending else -fout], indices]) + sorter = np.lexsort( + np.r_[[fout if ascending else -fout], indices] + ) # sorts using right columns first + # print(sorter) + fout = fout[sorter] + ncodes = [code[sorter] for code in ncodes] + mi = MultiIndex( + levels=levels, codes=ncodes, names=names, verify_integrity=False + ) + print(f"{mi=}") + if is_integer_dtype(fout): + fout = ensure_int64(fout) + return self.obj._constructor(fout, index=mi, name=self._selection_name) def count(self) -> Series: """ diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 4b12a1e0b2da4..6d221fa89de6b 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -40,7 +40,7 @@ def seed_df(seed_nans, n, m): binned = [] ids = [] for seed_nans in [True, False]: - for n, m in product((100, 1000), (5, 20)): + for n, m in product((10, 1000), (5, 20)): df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) keys = "1st", "2nd", ["1st", "2nd"] @@ -68,32 +68,65 @@ def rebuild_index(df): normalize=normalize, sort=sort, ascending=ascending, dropna=dropna, bins=bins ) + print(f"{df=}") gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) left.index.names = left.index.names[:-1] + ["3rd"] - gr = df.groupby(keys, sort=isort) + # gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) right.index.names = right.index.names[:-1] + ["3rd"] + print(f"{left=}") + print(f"{right=}") # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 + # have to ignore 0 counts to be consistent with individual column value_counts + left = left[left.astype(bool)] + right = right[right.astype(bool)] tm.assert_series_equal(left.sort_index(), right.sort_index()) def test_groubpy_value_counts_bins(): # GH32471 BINS = [0, 20, 80, 100] - df = DataFrame( - [[0, 0], [1, 100], [0, 100], [2, 0], [3, 100]], columns=["key", "score"] + values = [ + [0, 5, 0], + [1, 5, 100], + [0, 5, 100], + [2, 5, 0], + [3, 6, 100], + [3, 5, 100], + [1, 5, 100], + ] + df = DataFrame(values, columns=["key1", "key2", "score"]) + result = df.groupby(["key1", "key2"])["score"].value_counts(bins=BINS) + print(f"{result=}") + print( + df.groupby(["key1", "key2"])["score"].apply( + Series.value_counts, + bins=BINS, + sort=True, + normalize=True, + ascending=True, + dropna=True, + ) ) - result = df.groupby("key")["score"].value_counts(bins=BINS) + result.sort_index(inplace=True) intervals = cut(Series([0]), bins=BINS, include_lowest=True).cat.categories - index = MultiIndex.from_product( - [[0, 1, 2, 3], sorted(intervals)], names=("key", None) + # groups = [(0,5), (1,5), (2,5), (3,5), (3,6)] + groups = set((v[1], v[2], i) for v in values for i in intervals) + # {val[:-1]: 0 for val in values} + index = product([], intervals) + + """index = MultiIndex.from_product( + [groups, sorted(intervals)], names=("key1", "key2", "score") + )""" + expected = Series( + [1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1], result.index, name="score" ) - expected = Series([1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1], index, name="score") + # expected = [2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index f97362ce9c2a9..d965fe4faa075 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -179,6 +179,13 @@ def test_value_counts_categorical_with_nan(self): res = ser.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) + def test_value_counts_interval_bins(self): + ser = Series([1, 2, 3, 0, 1, 4], ["a", "a", "a", "b", "b", "c"]) + res = ser.value_counts(bins=[0, 1, 2]) + print(res) + exp = Series([2, 3, 4]) + tm.assert_series_equal(res, exp) + @pytest.mark.parametrize( "ser, dropna, exp", [ From 637a6098fb058871b40a9b45bff37cd155b4869d Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 25 Jul 2020 23:07:23 -0400 Subject: [PATCH 19/31] nan doesnt work for times --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a81be6b28bfce..b980a985c3077 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -692,7 +692,7 @@ def value_counts( mask = ids != -1 ids, val = ids[mask], val[mask] if dropna: - mask = ~np.isnan(val) + mask = ~isna(val) if not mask.all(): ids, val = ids[mask], val[mask] # codes = [code[mask] for code in codes] @@ -797,7 +797,7 @@ def value_counts( print("codes: ", codes) # codes = [code[changes] for code in codes] used_ids = np.unique(ids) - codes = [code[used_ids] for code in codes] + # codes = [code[used_ids] for code in codes] codes = [rep(level_codes) for level_codes in codes] + [val_lab] print(f"{codes=}") From c9a4383563c315b9b6f45eb1c9a20393adee8739 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 26 Jul 2020 21:39:53 -0400 Subject: [PATCH 20/31] passing all value count tests --- pandas/core/groupby/generic.py | 138 ++++++---------------- pandas/tests/base/test_value_counts.py | 29 +++-- pandas/tests/groupby/test_value_counts.py | 14 +-- 3 files changed, 52 insertions(+), 129 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b980a985c3077..3d5e13dbe658d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -45,13 +45,11 @@ ensure_platform_int, is_bool, is_integer_dtype, - is_interval_dtype, is_numeric_dtype, is_object_dtype, is_scalar, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna, notna from pandas.core.aggregation import ( @@ -61,6 +59,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.algorithms import unique from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -78,6 +77,7 @@ import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series +from pandas.core.sorting import compress_group_index from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, generate_numba_func, @@ -667,7 +667,6 @@ def value_counts( ): from pandas.core.reshape.tile import cut - from pandas.core.reshape.merge import _get_join_indexers if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level @@ -679,14 +678,9 @@ def value_counts( ascending=ascending, bins=bins, ) - keys = [k for k in self.groups] - # print(f'{self.groups=}') ids, _, _ = self.grouper.group_info - # print(f'{ids=}') val = self.obj._values - print(f"{keys=}") codes = self.grouper.reconstructed_codes # this will track the groups - print("codes: ", codes) # groupby removes null keys from groupings mask = ids != -1 @@ -695,111 +689,63 @@ def value_counts( mask = ~isna(val) if not mask.all(): ids, val = ids[mask], val[mask] - # codes = [code[mask] for code in codes] - print(f"{ids=}") - print(f"{val=}") - - print(f"{bins=}") if bins is None: val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna) - print(f"{val_lab=}") else: # val_lab is a Categorical with categories an IntervalIndex - print(f"{Series(val)=}") val_lab = cut(Series(val), bins, include_lowest=True) - # cut excludes NaN from its categories, so need to manually add - print(f"{val_lab=}") - print((not dropna) and (val_lab.hasnans)) - """if (not dropna) and (val_lab.hasnans): - # val_lab = - cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories) - print(cat_nan) - val_lab = val_lab.astype(cat_nan).fillna('NaN') - """ - print(f"{val_lab=}") val_lev = val_lab.cat.categories val_lab = val_lab.cat.codes.values - print(f"{val_lab=}") - if dropna: - included = val_lab != -1 - ids, val_lab = ids[included], val_lab[included] - - # print('1st val_lab: ', val_lab.cat.codes) - # llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1] - print(f"{val_lev=}") - if is_interval_dtype(val_lab.dtype): - # TODO: should we do this inside II? - sorter = np.lexsort((val_lab.right, val_lab.left, ids)) - else: - sorter = np.lexsort((val_lab, ids)) + + if dropna: + included = val_lab != -1 + ids, val_lab = ids[included], val_lab[included] + + sorter = np.lexsort((val_lab, ids)) ids, val_lab = ids[sorter], val_lab[sorter] + used_ids = unique(ids) + if max(used_ids) >= len( + codes[0] + ): # this means we had something skipped from the start + used_ids = compress_group_index(used_ids)[0] + codes = [code[used_ids] for code in codes] # drop what was taken out for n/a - print("ids: ", ids) - print(f"{val_lab=}") - # val_lab = val_lab.values - # print(f'{val_lab=}') # group boundaries are where group ids change # new values are where sorted labels change change_ids = ids[1:] != ids[:-1] - print((val_lab[1:] != val_lab[:-1])) changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1])) - """ - changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1]) - for i in range(len(ids)-1)] #((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1])) - """ - print(f"{changes=}") - print(np.diff(np.nonzero(changes), append=len(changes))[0]) changes = np.r_[True, changes] - cts = np.diff(np.nonzero(np.r_[changes, True]))[0] # , append=len(changes))[0] - print(f"{cts=}") val_lab = val_lab[changes] ids = ids[changes] - print("ids: ", ids) - - change_ids = ( - ids[1:] != ids[:-1] - ) # need to update now that we removed full repeats - # num_id_rep = np.diff(np.nonzero(np.r_[True, chan])) - print(f"{change_ids=}") - print(f"{val_lab=}") + cts = np.diff(np.nonzero(np.r_[changes, True]))[0] + idx = np.r_[0, 1 + np.nonzero(change_ids)[0]] + rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx)) num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0] - rep = partial(np.repeat, repeats=num_repeats) - print(f"{rep=}") + + change_ids = np.r_[ # need to update now that we removed full repeats + ids[1:] != ids[:-1], True + ] + if (not dropna) and (-1 in val_lab): + # in this case we need to explicitly add NaN as a level val_lev = np.r_[Index([np.nan]), val_lev] val_lab += 1 + levels = [ping.group_index for ping in self.grouper.groupings] + [ Index(val_lev) ] - print(f"{levels=}") names = self.grouper.names + [self._selection_name] - print(f"{names=}") if normalize: - num_vals = [] - ix = 0 - print(f"{num_repeats=}") - for i, r in enumerate(num_repeats): - num_vals.append(np.sum(cts[ix : ix + r])) - # print(out[ix:ix+r]) - ix += r - # print(f'{ix=}') - # [np.sum(out[i:i+r]) ] - print(f"{num_vals=}") - print(f"{cts=}") cts = cts.astype("float") - cts /= rep(num_vals) # each divisor is the number of repeats for that index - print(f"{cts=}") + cts /= rep( + num_repeats + ) # each divisor is the number of repeats for that index if bins is None: - print("codes: ", codes) - # codes = [code[changes] for code in codes] - used_ids = np.unique(ids) - # codes = [code[used_ids] for code in codes] codes = [rep(level_codes) for level_codes in codes] + [val_lab] - print(f"{codes=}") if sort: indices = tuple(reversed(codes[:-1])) @@ -808,52 +754,36 @@ def value_counts( ) # sorts using right columns first cts = cts[sorter] codes = [code[sorter] for code in codes] - print(f"{cts=}") + mi = MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=False ) - # print(f'{mi=}') if is_integer_dtype(cts): cts = ensure_int64(cts) return self.obj._constructor(cts, index=mi, name=self._selection_name) - nbin = len(levels[-1]) - # print(f'{codes=}') - print(len(cts), len(codes[0]), len(sorter)) - # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros - print(f"{ids=}") + nbin = len(levels[-1]) ncat = len(codes[0]) - # ncat = len(ids) - print(f"{nbin=}") fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) - for i, ct in enumerate(cts): - fout[ids[i] * nbin + val_lab[i]] = ct - print(f"{fout=}", len(fout)) - + id = 0 + for i, ct in enumerate(cts): # fill in nonzero values of fout + fout[id * nbin + val_lab[i]] = cts[i] + id += change_ids[i] ncodes = [np.repeat(code, nbin) for code in codes] - print(f"{ncodes=}") ncodes.append(np.tile(range(nbin), len(codes[0]))) - """ - fout = cts - ncodes = [rep(level_codes) for level_codes in codes] + [val_lab] - """ - print(f"{ncodes=}") + if sort: indices = tuple(reversed(ncodes[:-1])) - print(f"{indices=}") - # print(np.r_[[fout if ascending else -fout], indices]) sorter = np.lexsort( np.r_[[fout if ascending else -fout], indices] ) # sorts using right columns first - # print(sorter) fout = fout[sorter] ncodes = [code[sorter] for code in ncodes] mi = MultiIndex( levels=levels, codes=ncodes, names=names, verify_integrity=False ) - print(f"{mi=}") if is_integer_dtype(fout): fout = ensure_int64(fout) return self.obj._constructor(fout, index=mi, name=self._selection_name) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 17e1afe906b0a..558a66952d074 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -191,31 +191,34 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 0 -def test_value_counts_bins_nas(): +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize("bins", [None, 3, [0, 1, 3, 6]]) +def test_value_counts_bins_nas(dropna, bins): # GH25970, handle normalizing bins with NA's properly # First test that NA's are included appropriately rand_data = np.append( np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20) ) s = Series(rand_data) - assert s.value_counts(dropna=False).index.hasnans - assert not s.value_counts(dropna=True).index.hasnans - assert s.value_counts(dropna=False, bins=3).index.hasnans - assert not s.value_counts(dropna=True, bins=3).index.hasnans - assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans - assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans - - # then verify specific example + if dropna: + assert not s.value_counts(dropna=dropna, bins=bins).index.hasnans + else: + assert s.value_counts(dropna=dropna, bins=bins).index.hasnans + + +def test_value_counts_bins_specific_na(): + # verify specific NA example s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5]) intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2])) - expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2]) tm.assert_series_equal( s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna ) - tm.assert_numpy_array_equal( - s2.value_counts(dropna=False, normalize=True, bins=3).values, - expected_keepna_vals, + keys = list(intervals.take([1, 0, 2])) + keys.insert(2, np.nan) + expected_keepna = Series([0.3, 0.3, 0.2, 0.2], keys) + tm.assert_series_equal( + s2.value_counts(dropna=False, normalize=True, bins=3), expected_keepna ) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 6d221fa89de6b..fe084a7a941c6 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, Grouper, MultiIndex, Series, cut, date_range, to_datetime +from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime import pandas._testing as tm @@ -40,7 +40,7 @@ def seed_df(seed_nans, n, m): binned = [] ids = [] for seed_nans in [True, False]: - for n, m in product((10, 1000), (5, 20)): + for n, m in product((100, 1000), (5, 20)): df = seed_df(seed_nans, n, m) bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) keys = "1st", "2nd", ["1st", "2nd"] @@ -114,19 +114,9 @@ def test_groubpy_value_counts_bins(): ) result.sort_index(inplace=True) - intervals = cut(Series([0]), bins=BINS, include_lowest=True).cat.categories - # groups = [(0,5), (1,5), (2,5), (3,5), (3,6)] - groups = set((v[1], v[2], i) for v in values for i in intervals) - # {val[:-1]: 0 for val in values} - index = product([], intervals) - - """index = MultiIndex.from_product( - [groups, sorted(intervals)], names=("key1", "key2", "score") - )""" expected = Series( [1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1], result.index, name="score" ) - # expected = [2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1] tm.assert_series_equal(result, expected) From d2399ea4a7bdf934ff8b6303a34f93ace8aa01a1 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Mon, 27 Jul 2020 12:17:15 -0400 Subject: [PATCH 21/31] speedups 1 --- pandas/core/groupby/generic.py | 13 +++++++------ pandas/tests/groupby/test_value_counts.py | 15 --------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 513f47302df40..999729bf4dbab 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -684,6 +684,7 @@ def value_counts( from pandas.core.reshape.tile import cut + """ if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level # in a backward compatible way @@ -694,6 +695,7 @@ def value_counts( ascending=ascending, bins=bins, ) + """ ids, _, _ = self.grouper.group_info val = self.obj._values codes = self.grouper.reconstructed_codes # this will track the groups @@ -735,14 +737,9 @@ def value_counts( val_lab = val_lab[changes] ids = ids[changes] cts = np.diff(np.nonzero(np.r_[changes, True]))[0] - idx = np.r_[0, 1 + np.nonzero(change_ids)[0]] + # how many times each index gets repeated rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx)) - num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0] - - change_ids = np.r_[ # need to update now that we removed full repeats - ids[1:] != ids[:-1], True - ] if (not dropna) and (-1 in val_lab): # in this case we need to explicitly add NaN as a level @@ -755,6 +752,7 @@ def value_counts( names = self.grouper.names + [self._selection_name] if normalize: + num_repeats = np.diff(idx, append=len(ids)) cts = cts.astype("float") cts /= rep( num_repeats @@ -784,6 +782,9 @@ def value_counts( ncat = len(codes[0]) fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) id = 0 + change_ids = np.r_[ # need to update now that we removed full repeats + ids[1:] != ids[:-1], True + ] for i, ct in enumerate(cts): # fill in nonzero values of fout fout[id * nbin + val_lab[i]] = cts[i] id += change_ids[i] diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index fe084a7a941c6..5b83119f89903 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -68,7 +68,6 @@ def rebuild_index(df): normalize=normalize, sort=sort, ascending=ascending, dropna=dropna, bins=bins ) - print(f"{df=}") gr = df.groupby(keys, sort=isort) left = gr["3rd"].value_counts(**kwargs) left.index.names = left.index.names[:-1] + ["3rd"] @@ -76,8 +75,6 @@ def rebuild_index(df): # gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) right.index.names = right.index.names[:-1] + ["3rd"] - print(f"{left=}") - print(f"{right=}") # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 @@ -101,18 +98,6 @@ def test_groubpy_value_counts_bins(): ] df = DataFrame(values, columns=["key1", "key2", "score"]) result = df.groupby(["key1", "key2"])["score"].value_counts(bins=BINS) - print(f"{result=}") - print( - df.groupby(["key1", "key2"])["score"].apply( - Series.value_counts, - bins=BINS, - sort=True, - normalize=True, - ascending=True, - dropna=True, - ) - ) - result.sort_index(inplace=True) expected = Series( [1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1], result.index, name="score" From ec92f15a260c0a72d2df564f9e376b3cb4d02fb7 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 1 Aug 2020 20:00:20 -0400 Subject: [PATCH 22/31] speedup? --- pandas/core/groupby/generic.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 54bb17f4e9d8a..9455b37ce62f7 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -769,13 +769,17 @@ def value_counts( nbin = len(levels[-1]) ncat = len(codes[0]) fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) - id = 0 + """ change_ids = np.r_[ # need to update now that we removed full repeats ids[1:] != ids[:-1], True ] + """ + id = 0 + ct_len = len(cts) for i, ct in enumerate(cts): # fill in nonzero values of fout fout[id * nbin + val_lab[i]] = cts[i] - id += change_ids[i] + if i < ct_len - 1: # avoid index error + id += ids[i] != ids[i + 1] ncodes = [np.repeat(code, nbin) for code in codes] ncodes.append(np.tile(range(nbin), len(codes[0]))) From d6179b0510582c8c09945eab6f1c2a868c04fdf7 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 1 Aug 2020 21:25:15 -0400 Subject: [PATCH 23/31] Revert "speedup?" This reverts commit ec92f15a260c0a72d2df564f9e376b3cb4d02fb7. --- pandas/core/groupby/generic.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9455b37ce62f7..54bb17f4e9d8a 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -769,17 +769,13 @@ def value_counts( nbin = len(levels[-1]) ncat = len(codes[0]) fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) - """ + id = 0 change_ids = np.r_[ # need to update now that we removed full repeats ids[1:] != ids[:-1], True ] - """ - id = 0 - ct_len = len(cts) for i, ct in enumerate(cts): # fill in nonzero values of fout fout[id * nbin + val_lab[i]] = cts[i] - if i < ct_len - 1: # avoid index error - id += ids[i] != ids[i + 1] + id += change_ids[i] ncodes = [np.repeat(code, nbin) for code in codes] ncodes.append(np.tile(range(nbin), len(codes[0]))) From 5abfb16455be8644ba37906c44a9daec430b20c9 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Mon, 10 Aug 2020 10:48:49 -0400 Subject: [PATCH 24/31] fixed comments --- pandas/core/groupby/generic.py | 2 -- pandas/tests/base/test_value_counts.py | 2 +- pandas/tests/groupby/test_value_counts.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 54bb17f4e9d8a..7c70bf8d98de1 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -726,7 +726,6 @@ def value_counts( ids = ids[changes] cts = np.diff(np.nonzero(np.r_[changes, True]))[0] idx = np.r_[0, 1 + np.nonzero(change_ids)[0]] - print(idx) # how many times each index gets repeated rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx)) @@ -742,7 +741,6 @@ def value_counts( if normalize: num_repeats = np.diff(idx, append=len(change_ids) + 1) - print(num_repeats) cts = cts.astype("float") / rep(num_repeats) # each divisor is the number of repeats for that index diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 558a66952d074..8ea7f0fe3fc98 100755 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -207,7 +207,7 @@ def test_value_counts_bins_nas(dropna, bins): def test_value_counts_bins_specific_na(): - # verify specific NA example + # GH25970 case where proportions were incorrect for dropna and normalize=True s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5]) intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0]) expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2])) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 5b83119f89903..94e19b93368d8 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -72,12 +72,12 @@ def rebuild_index(df): left = gr["3rd"].value_counts(**kwargs) left.index.names = left.index.names[:-1] + ["3rd"] - # gr = df.groupby(keys, sort=isort) right = gr["3rd"].apply(Series.value_counts, **kwargs) right.index.names = right.index.names[:-1] + ["3rd"] # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 + # have to ignore 0 counts to be consistent with individual column value_counts left = left[left.astype(bool)] right = right[right.astype(bool)] From 5f33834181831f5061e1e55b481457a1eb762ef1 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 12 Sep 2020 08:09:45 -0400 Subject: [PATCH 25/31] removed unneeded import --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b9868e1538c66..2a67dfaa41c73 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -685,7 +685,6 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.merge import _get_join_indexers from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): From f685cb2bec8dce856d83f2b1b1ddf5b0425b306c Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 12 Sep 2020 08:32:03 -0400 Subject: [PATCH 26/31] updated to use na_sentinal param --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a3e7bf38fcd3f..fff8ce6da2c87 100755 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -712,7 +712,9 @@ def value_counts( ids, val = ids[mask], val[mask] if bins is None: - val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna) + val_lab, val_lev = algorithms.factorize( + val, sort=True, na_sentinel=(None if dropna else -1) + ) else: # val_lab is a Categorical with categories an IntervalIndex val_lab = cut(Series(val), bins, include_lowest=True) From c21bdbb3a767efdae7dfd91a9395da799aa35f06 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sat, 12 Sep 2020 09:31:52 -0400 Subject: [PATCH 27/31] fixed bad test --- pandas/tests/series/methods/test_value_counts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index d965fe4faa075..270f29914442a 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -182,8 +182,7 @@ def test_value_counts_categorical_with_nan(self): def test_value_counts_interval_bins(self): ser = Series([1, 2, 3, 0, 1, 4], ["a", "a", "a", "b", "b", "c"]) res = ser.value_counts(bins=[0, 1, 2]) - print(res) - exp = Series([2, 3, 4]) + exp = Series([3, 1], res.index) tm.assert_series_equal(res, exp) @pytest.mark.parametrize( From e4c255213593e75be45e185944239cfce1996072 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 13 Sep 2020 12:02:39 -0400 Subject: [PATCH 28/31] moved doc, reverted permissions --- doc/source/whatsnew/v1.0.3.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 3 +-- doc/source/whatsnew/v1.2.0.rst | 4 +++- 3 files changed, 4 insertions(+), 4 deletions(-) mode change 100755 => 100644 doc/source/whatsnew/v1.0.3.rst diff --git a/doc/source/whatsnew/v1.0.3.rst b/doc/source/whatsnew/v1.0.3.rst old mode 100755 new mode 100644 index cc08329934897..62e6ae5b1c5cc --- a/doc/source/whatsnew/v1.0.3.rst +++ b/doc/source/whatsnew/v1.0.3.rst @@ -23,7 +23,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Contributors ~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f88a788aa03cf..7529c73aa14f2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -876,8 +876,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) -Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) + Categorical ^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bce6a735b7b07..4052046125baa 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -215,6 +215,7 @@ Performance improvements Bug fixes ~~~~~~~~~ + Categorical ^^^^^^^^^^^ @@ -315,7 +316,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- +- Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) +- Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) Reshaping ^^^^^^^^^ From 74b13d8c8201807fa1b121e3cfccabdf125c3c38 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 13 Sep 2020 12:18:28 -0400 Subject: [PATCH 29/31] more doc and permission fix --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 3 +-- pandas/core/algorithms.py | 2 +- pandas/core/groupby/generic.py | 5 ++--- 4 files changed, 4 insertions(+), 7 deletions(-) mode change 100755 => 100644 pandas/core/algorithms.py mode change 100755 => 100644 pandas/core/groupby/generic.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7529c73aa14f2..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -955,7 +955,6 @@ Numeric - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in arithmetic operations between :class:`DataFrame` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) -- Bug in :meth:`Series.value_counts` with ``normalize=True`` for NA values (:issue:`25970`) - Bug in :meth:`Index.difference` giving incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) - :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raise a ValueError if ``limit_direction`` is ``'forward'`` or ``'both'`` and ``method`` is ``'backfill'`` or ``'bfill'`` or ``limit_direction`` is ``'backward'`` or ``'both'`` and ``method`` is ``'pad'`` or ``'ffill'`` (:issue:`34746`) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 4052046125baa..98304a9fcbe26 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -215,7 +215,6 @@ Performance improvements Bug fixes ~~~~~~~~~ - Categorical ^^^^^^^^^^^ @@ -248,6 +247,7 @@ Numeric ^^^^^^^ - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) +- Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) - Conversion @@ -316,7 +316,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) - Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) Reshaping diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py old mode 100755 new mode 100644 index dedcbcd8c8a21..532427dd69782 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -758,7 +758,7 @@ def value_counts( if is_extension_array_dtype(values): - # handle Categorical and sparse, + # handle Categorical and sparse data, result = Series(values)._values.value_counts(dropna=dropna) result.name = name diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py old mode 100755 new mode 100644 index fff8ce6da2c87..cff9a24bd1540 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -728,9 +728,8 @@ def value_counts( sorter = np.lexsort((val_lab, ids)) ids, val_lab = ids[sorter], val_lab[sorter] used_ids = unique(ids) - if max(used_ids) >= len( - codes[0] - ): # this means we had something skipped from the start + if max(used_ids) >= len(codes[0]): + # this means we had something skipped from the start used_ids = compress_group_index(used_ids)[0] codes = [code[used_ids] for code in codes] # drop what was taken out for n/a From f0e630a4b0788a266c2a7187c85bd021d7a56071 Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 13 Sep 2020 22:04:57 -0400 Subject: [PATCH 30/31] fixed docstrings --- doc/source/whatsnew/v1.0.0.rst | 0 doc/source/whatsnew/v1.2.0.rst | 6 +++--- pandas/core/algorithms.py | 3 ++- pandas/core/base.py | 5 +++-- 4 files changed, 8 insertions(+), 6 deletions(-) mode change 100755 => 100644 doc/source/whatsnew/v1.0.0.rst diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst old mode 100755 new mode 100644 diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 98304a9fcbe26..fc2aaba6e9c0c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -247,8 +247,8 @@ Numeric ^^^^^^^ - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) -- Fixed Series.value_counts so that normalize excludes NA values when dropna=False. (:issue:`25970`) -- +- Bug in :meth:`Series.value_counts` with ``dropna=True`` and ``normalize=True`` where value counts did not sum to 1. (:issue:`25970`) + Conversion ^^^^^^^^^^ @@ -316,7 +316,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Fixed Dataframe Groupby value_counts with bins (:issue:`32471`) +- Bug in :meth:`DataframeGroupBy.value_counts` outputs wrong index labels with bins (:issue:`32471`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 532427dd69782..5e2944c80de92 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -725,11 +725,12 @@ def value_counts( bins : integer or iterable of numeric, optional Rather than count values, group them into half-open bins. Only works with numeric data. - If int, interpreted as number of bins and will use pd.cut. + If int, interpreted as number of bins. If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True Don't include counts of NaN. If False and NaNs are present, NaN will be a key in the output. + .. versionchanged:: 1.2 Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index 68454f50def09..55ca1259bb188 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1182,11 +1182,12 @@ def value_counts( bins : integer or iterable of numeric, optional Rather than count individual values, group them into half-open bins. Only works with numeric data. - If int, interpreted as number of bins and will use `pd.cut`. + If int, interpreted as number of bins. If interable of numeric, will use provided numbers as bin endpoints. dropna : bool, default True Don't include counts of NaN. If False and NaNs are present, NaN will be a key in the output. + .. versionchanged:: 1.1.2 Returns ------- @@ -1237,7 +1238,7 @@ def value_counts( Bins can also be an iterable of numbers. These numbers are treated as endpoints for the intervals. - >>> s.value_counts(bins=[0,2,4,9]) + >>> s.value_counts(bins=[0, 2, 4, 9]) (2.0, 4.0] 3 (-0.001, 2.0] 2 (4.0, 9.0] 0 From 9763e83b02995d706f967923b231b0776f09f60c Mon Sep 17 00:00:00 2001 From: DataInformer Date: Sun, 13 Sep 2020 22:10:19 -0400 Subject: [PATCH 31/31] file perm --- doc/source/whatsnew/v1.0.0.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 doc/source/whatsnew/v1.0.0.rst diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst old mode 100644 new mode 100755