diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index bce6a735b7b07..fc2aaba6e9c0c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -247,7 +247,8 @@ Numeric
 ^^^^^^^
 - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`)
 - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`)
--
+- Bug in :meth:`Series.value_counts` with ``dropna=True`` and ``normalize=True`` where the value counts did not sum to 1 (:issue:`25970`)
+
 
 Conversion
 ^^^^^^^^^^
@@ -315,7 +316,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
 - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
 - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
--
+- Bug in :meth:`DataFrameGroupBy.value_counts` producing incorrect index labels when ``bins`` is passed (:issue:`32471`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 872c51c7dfa75..5e2944c80de92 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -720,17 +720,23 @@ def value_counts(
     ascending : bool, default False
         Sort in ascending order
     normalize: bool, default False
-        If True then compute a relative histogram
-    bins : integer, optional
-        Rather than count values, group them into half-open bins,
-        convenience for pd.cut, only works with numeric data
+        If True, then compute a relative histogram that outputs the
+        proportion of each value.
+    bins : integer or iterable of numeric, optional
+        Rather than count values, group them into half-open bins.
+        Only works with numeric data.
+        If int, interpreted as the number of bins.
+        If iterable of numeric, the provided numbers are used as bin endpoints.
    dropna : bool, default True
-        Don't include counts of NaN
+        Don't include counts of NaN.
+        If False and NaNs are present, NaN will be a key in the output.
+
+        .. versionchanged:: 1.2

     Returns
     -------
     Series
     """
+    from pandas.core.series import Series
 
     name = getattr(values, "name", None)
 
@@ -744,39 +750,30 @@ def value_counts(
         except TypeError as err:
             raise TypeError("bins argument only works with numeric data.") from err
 
-        # count, remove nulls (from the index), and but the bins
+        # count, remove nulls (from the index), and use the bins
         result = ii.value_counts(dropna=dropna)
-        result = result[result.index.notna()]
         result.index = result.index.astype("interval")
         result = result.sort_index()
 
-        # if we are dropna and we have NO values
-        if dropna and (result._values == 0).all():
-            result = result.iloc[0:0]
-
-        # normalizing is by len of all (regardless of dropna)
-        counts = np.array([len(ii)])
-
     else:
 
         if is_extension_array_dtype(values):
 
-            # handle Categorical and sparse,
+            # handle Categorical and sparse data,
             result = Series(values)._values.value_counts(dropna=dropna)
             result.name = name
-            counts = result._values
 
         else:
             keys, counts = value_counts_arraylike(values, dropna)
 
             result = Series(counts, index=keys, name=name)
 
-    if sort:
-        result = result.sort_values(ascending=ascending)
-
     if normalize:
-        result = result / float(counts.sum())
+        counts = result._values
+        result = result / float(max(counts.sum(), 1))
+
+    if sort:
+        result = result.sort_values(ascending=ascending)
 
     return result
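Note on the ``algorithms.value_counts`` change above: normalization now divides by the sum of the retained counts (rather than the length of the pre-``cut`` interval index) and happens before sorting. A doctest-style sketch of the intended post-fix behavior, reusing the data from the new tests further down (exact interval formatting may differ):

    >>> import numpy as np
    >>> import pandas as pd
    >>> s = pd.Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
    >>> s.value_counts(dropna=True, normalize=True, bins=3)
    (2.333, 3.667]    0.375
    (0.995, 2.333]    0.375
    (3.667, 5.0]      0.250
    dtype: float64

Previously the two NaNs stayed in the denominator, so these proportions summed to 0.8 instead of 1 (GH25970).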
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 1926803d8f04b..55ca1259bb188 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -1174,17 +1174,20 @@ def value_counts(
         Parameters
         ----------
         normalize : bool, default False
-            If True then the object returned will contain the relative
-            frequencies of the unique values.
+            If True, outputs the relative frequencies of the unique values.
         sort : bool, default True
             Sort by frequencies.
         ascending : bool, default False
             Sort in ascending order.
-        bins : int, optional
-            Rather than count values, group them into half-open bins,
-            a convenience for ``pd.cut``, only works with numeric data.
+        bins : integer or iterable of numeric, optional
+            Rather than count individual values, group them into half-open bins.
+            Only works with numeric data.
+            If int, interpreted as the number of bins.
+            If iterable of numeric, the provided numbers are used as bin endpoints.
         dropna : bool, default True
             Don't include counts of NaN.
+            If False and NaNs are present, NaN will be a key in the output.
+
+            .. versionchanged:: 1.2
 
         Returns
         -------
@@ -1221,8 +1224,10 @@ def value_counts(
 
         Bins can be useful for going from a continuous variable to a
         categorical variable; instead of counting unique
-        apparitions of values, divide the index in the specified
-        number of half-open bins.
+        instances of values, count the number of values that fall
+        into half-open intervals.
+
+        Bins can be an int.
 
         >>> s.value_counts(bins=3)
         (2.0, 3.0]       2
         (-0.001, 1.0]    2
         (3.0, 4.0]       1
         dtype: int64
 
+        Bins can also be an iterable of numbers. These numbers are treated
+        as endpoints for the intervals.
+
+        >>> s.value_counts(bins=[0, 2, 4, 9])
+        (2.0, 4.0]       3
+        (-0.001, 2.0]    2
+        (4.0, 9.0]       0
+        dtype: int64
+
         **dropna**
 
         With `dropna` set to `False` we can also see NaN index values.
@@ -1242,6 +1256,7 @@ def value_counts(
         1.0    1
         dtype: int64
         """
+
         result = value_counts(
             self,
             sort=sort,
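The ``dropna`` docstring addition is exercised by the new tests below; with ``dropna=False`` the NaN bucket becomes its own key and still participates in normalization. A sketch for the same series as above (formatting approximate):

    >>> s.value_counts(dropna=False, normalize=True, bins=3)
    (2.333, 3.667]    0.3
    (0.995, 2.333]    0.3
    NaN               0.2
    (3.667, 5.0]      0.2
    dtype: float64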
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index e870187fc7952..cff9a24bd1540 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -45,7 +45,6 @@
     ensure_platform_int,
     is_bool,
     is_integer_dtype,
-    is_interval_dtype,
     is_numeric_dtype,
     is_object_dtype,
     is_scalar,
@@ -59,6 +58,7 @@
     validate_func_kwargs,
 )
 import pandas.core.algorithms as algorithms
+from pandas.core.algorithms import unique
 from pandas.core.arrays import ExtensionArray
 from pandas.core.base import DataError, SpecificationError
 import pandas.core.common as com
@@ -79,6 +79,7 @@
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import BlockManager
 from pandas.core.series import Series
+from pandas.core.sorting import compress_group_index
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba
 from pandas.plotting import boxplot_frame_groupby
 
@@ -685,7 +686,6 @@ def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
 
-        from pandas.core.reshape.merge import get_join_indexers
         from pandas.core.reshape.tile import cut
 
         if bins is not None and not np.iterable(bins):
@@ -701,111 +701,111 @@ def value_counts(
 
         ids, _, _ = self.grouper.group_info
         val = self.obj._values
+        codes = self.grouper.reconstructed_codes  # this will track the groups
 
         # groupby removes null keys from groupings
         mask = ids != -1
         ids, val = ids[mask], val[mask]
+        if dropna:
+            mask = ~isna(val)
+            if not mask.all():
+                ids, val = ids[mask], val[mask]
 
         if bins is None:
-            lab, lev = algorithms.factorize(val, sort=True)
-            llab = lambda lab, inc: lab[inc]
+            val_lab, val_lev = algorithms.factorize(
+                val, sort=True, na_sentinel=(None if dropna else -1)
+            )
         else:
+            # val_lab is a Categorical with categories an IntervalIndex
+            val_lab = cut(Series(val), bins, include_lowest=True)
+            val_lev = val_lab.cat.categories
+            val_lab = val_lab.cat.codes.values
 
-            # lab is a Categorical with categories an IntervalIndex
-            lab = cut(Series(val), bins, include_lowest=True)
-            lev = lab.cat.categories
-            lab = lev.take(lab.cat.codes)
-            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
-
-        if is_interval_dtype(lab.dtype):
-            # TODO: should we do this inside II?
-            sorter = np.lexsort((lab.left, lab.right, ids))
-        else:
-            sorter = np.lexsort((lab, ids))
+            if dropna:
+                included = val_lab != -1
+                ids, val_lab = ids[included], val_lab[included]
 
-        ids, lab = ids[sorter], lab[sorter]
+        sorter = np.lexsort((val_lab, ids))
+        ids, val_lab = ids[sorter], val_lab[sorter]
+        used_ids = unique(ids)
+        if max(used_ids) >= len(codes[0]):
+            # this means we had something skipped from the start
+            used_ids = compress_group_index(used_ids)[0]
+        codes = [code[used_ids] for code in codes]  # drop what was taken out for n/a
 
         # group boundaries are where group ids change
-        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
-
-        # new values are where sorted labels change
-        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
-        inc = np.r_[True, lchanges]
-        inc[idx] = True  # group boundaries are also new values
-        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
-
-        # num. of times each group should be repeated
-        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
-
-        # multi-index components
-        codes = self.grouper.reconstructed_codes
-        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
+        change_ids = ids[1:] != ids[:-1]
+        changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1]))
+        changes = np.r_[True, changes]
+        val_lab = val_lab[changes]
+        ids = ids[changes]
+        cts = np.diff(np.nonzero(np.r_[changes, True]))[0]
+        idx = np.r_[0, 1 + np.nonzero(change_ids)[0]]
+        # how many times each index gets repeated
+        rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx))
+
+        if (not dropna) and (-1 in val_lab):
+            # in this case we need to explicitly add NaN as a level
+            val_lev = np.r_[Index([np.nan]), val_lev]
+            val_lab += 1
+
+        levels = [ping.group_index for ping in self.grouper.groupings] + [
+            Index(val_lev)
+        ]
         names = self.grouper.names + [self._selection_name]
 
-        if dropna:
-            mask = codes[-1] != -1
-            if mask.all():
-                dropna = False
-            else:
-                out, codes = out[mask], [level_codes[mask] for level_codes in codes]
-
         if normalize:
-            out = out.astype("float")
-            d = np.diff(np.r_[idx, len(ids)])
-            if dropna:
-                m = ids[lab == -1]
-                np.add.at(d, m, -1)
-                acc = rep(d)[mask]
-            else:
-                acc = rep(d)
-            out /= acc
-
-        if sort and bins is None:
-            cat = ids[inc][mask] if dropna else ids[inc]
-            sorter = np.lexsort((out if ascending else -out, cat))
-            out, codes[-1] = out[sorter], codes[-1][sorter]
+            num_repeats = np.diff(idx, append=len(change_ids) + 1)
+            cts = cts.astype("float") / rep(num_repeats)
+            # each divisor is the number of repeats for that index
 
         if bins is None:
+            codes = [rep(level_codes) for level_codes in codes] + [val_lab]
+
+            if sort:
+                indices = tuple(reversed(codes[:-1]))
+                sorter = np.lexsort(
+                    np.r_[[val_lab], [cts if ascending else -cts], indices]
+                )  # sorts using right columns first
+                cts = cts[sorter]
+                codes = [code[sorter] for code in codes]
+
             mi = MultiIndex(
                 levels=levels, codes=codes, names=names, verify_integrity=False
             )
-
-            if is_integer_dtype(out):
-                out = ensure_int64(out)
-            return self.obj._constructor(out, index=mi, name=self._selection_name)
+            if is_integer_dtype(cts):
+                cts = ensure_int64(cts)
+            return self.obj._constructor(cts, index=mi, name=self._selection_name)
 
         # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
-        diff = np.zeros(len(out), dtype="bool")
-        for level_codes in codes[:-1]:
-            diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
-
-        ncat, nbin = diff.sum(), len(levels[-1])
-
-        left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
-
-        right = [diff.cumsum() - 1, codes[-1]]
-
-        _, idx = get_join_indexers(left, right, sort=False, how="left")
-        out = np.where(idx != -1, out[idx], 0)
+        nbin = len(levels[-1])
+        ncat = len(codes[0])
+        fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64)
+        id = 0
+        change_ids = np.r_[  # need to update now that we removed full repeats
+            ids[1:] != ids[:-1], True
+        ]
+        for i, ct in enumerate(cts):  # fill in nonzero values of fout
+            fout[id * nbin + val_lab[i]] = cts[i]
+            id += change_ids[i]
+        ncodes = [np.repeat(code, nbin) for code in codes]
+        ncodes.append(np.tile(range(nbin), len(codes[0])))
 
         if sort:
-            sorter = np.lexsort((out if ascending else -out, left[0]))
-            out, left[-1] = out[sorter], left[-1][sorter]
-
-        # build the multi-index w/ full levels
-        def build_codes(lev_codes: np.ndarray) -> np.ndarray:
-            return np.repeat(lev_codes[diff], nbin)
-
-        codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
-        codes.append(left[-1])
-
-        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
-
-        if is_integer_dtype(out):
-            out = ensure_int64(out)
-        return self.obj._constructor(out, index=mi, name=self._selection_name)
+            indices = tuple(reversed(ncodes[:-1]))
+            sorter = np.lexsort(
+                np.r_[[fout if ascending else -fout], indices]
+            )  # sorts using right columns first
+            fout = fout[sorter]
+            ncodes = [code[sorter] for code in ncodes]
+        mi = MultiIndex(
+            levels=levels, codes=ncodes, names=names, verify_integrity=False
+        )
+        if is_integer_dtype(fout):
+            fout = ensure_int64(fout)
+        return self.obj._constructor(fout, index=mi, name=self._selection_name)
 
     def count(self) -> Series:
         """
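The rewritten groupby ``value_counts`` above counts runs after a single ``np.lexsort`` over (group id, value label) pairs: a run ends wherever either key changes, and the run lengths are the counts. A minimal self-contained sketch of just that trick (the arrays are made up for illustration; the real code operates on ``ids`` from ``grouper.group_info`` and the factorized/binned ``val_lab``):

    >>> import numpy as np
    >>> ids = np.array([0, 0, 0, 1, 1])      # group id per row
    >>> val_lab = np.array([2, 0, 2, 1, 1])  # factorized value label per row
    >>> sorter = np.lexsort((val_lab, ids))  # sort by group, then by value
    >>> ids, val_lab = ids[sorter], val_lab[sorter]
    >>> changes = np.r_[True, (ids[1:] != ids[:-1]) | (val_lab[1:] != val_lab[:-1])]
    >>> np.diff(np.nonzero(np.r_[changes, True]))[0]  # run lengths = counts
    array([1, 2, 2])

That is, group 0 contains value 0 once and value 2 twice, and group 1 contains value 1 twice.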
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index de04c30432e6f..8ea7f0fe3fc98 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -191,6 +191,37 @@ def test_value_counts_bins(index_or_series):
     assert s.nunique() == 0
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize("bins", [None, 3, [0, 1, 3, 6]])
+def test_value_counts_bins_nas(dropna, bins):
+    # GH25970, handle normalizing bins with NA's properly
+    # First test that NA's are included appropriately
+    rand_data = np.append(
+        np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20)
+    )
+    s = Series(rand_data)
+    if dropna:
+        assert not s.value_counts(dropna=dropna, bins=bins).index.hasnans
+    else:
+        assert s.value_counts(dropna=dropna, bins=bins).index.hasnans
+
+
+def test_value_counts_bins_specific_na():
+    # GH25970 case where proportions were incorrect with dropna and normalize=True
+    s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
+    intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0])
+    expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2]))
+    tm.assert_series_equal(
+        s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna
+    )
+    keys = list(intervals.take([1, 0, 2]))
+    keys.insert(2, np.nan)
+    expected_keepna = Series([0.3, 0.3, 0.2, 0.2], keys)
+    tm.assert_series_equal(
+        s2.value_counts(dropna=False, normalize=True, bins=3), expected_keepna
+    )
+
+
 def test_value_counts_datetime64(index_or_series):
     klass = index_or_series
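For reference, the expected values in ``test_value_counts_bins_specific_na`` follow directly from the bin counts (3, 3, 2) of the 8 non-null values: with ``dropna=True`` the denominator is 8, while with ``dropna=False`` all 10 values count and the 2 NaNs form their own key:

    >>> 3 / 8, 3 / 8, 2 / 8
    (0.375, 0.375, 0.25)
    >>> 3 / 10, 3 / 10, 2 / 10, 2 / 10
    (0.3, 0.3, 0.2, 0.2)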
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index c86cb4532bc26..94e19b93368d8 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -41,13 +41,12 @@ def seed_df(seed_nans, n, m):
 ids = []
 for seed_nans in [True, False]:
     for n, m in product((100, 1000), (5, 20)):
-
         df = seed_df(seed_nans, n, m)
         bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
         keys = "1st", "2nd", ["1st", "2nd"]
         for k, b in product(keys, bins):
             binned.append((df, k, b, n, m))
-            ids.append(f"{k}-{n}-{m}")
+            ids.append(f"{k}-{n}-{m}-{seed_nans}")
 
 
 @pytest.mark.slow
@@ -71,16 +70,41 @@ def rebuild_index(df):
     gr = df.groupby(keys, sort=isort)
     left = gr["3rd"].value_counts(**kwargs)
+    left.index.names = left.index.names[:-1] + ["3rd"]
 
-    gr = df.groupby(keys, sort=isort)
     right = gr["3rd"].apply(Series.value_counts, **kwargs)
     right.index.names = right.index.names[:-1] + ["3rd"]
 
     # have to sort on index because of unstable sort on values
     left, right = map(rebuild_index, (left, right))  # xref GH9212
+
+    # have to ignore 0 counts to be consistent with individual column value_counts
+    left = left[left.astype(bool)]
+    right = right[right.astype(bool)]
     tm.assert_series_equal(left.sort_index(), right.sort_index())
 
 
+def test_groupby_value_counts_bins():
+    # GH32471
+    BINS = [0, 20, 80, 100]
+    values = [
+        [0, 5, 0],
+        [1, 5, 100],
+        [0, 5, 100],
+        [2, 5, 0],
+        [3, 6, 100],
+        [3, 5, 100],
+        [1, 5, 100],
+    ]
+    df = DataFrame(values, columns=["key1", "key2", "score"])
+    result = df.groupby(["key1", "key2"])["score"].value_counts(bins=BINS)
+    result.sort_index(inplace=True)
+    expected = Series(
+        [1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1], result.index, name="score"
+    )
+    tm.assert_series_equal(result, expected)
+
+
 def test_series_groupby_value_counts_with_grouper():
     # GH28479
     df = DataFrame(
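A quick sanity check on ``test_groupby_value_counts_bins`` (a sketch, not part of the test suite): the five observed ``(key1, key2)`` groups times three bins give 15 rows, zero-filled where a group has no scores in a bin, and the counts sum to the 7 input rows:

    >>> import pandas as pd
    >>> values = [[0, 5, 0], [1, 5, 100], [0, 5, 100], [2, 5, 0],
    ...           [3, 6, 100], [3, 5, 100], [1, 5, 100]]
    >>> df = pd.DataFrame(values, columns=["key1", "key2", "score"])
    >>> out = df.groupby(["key1", "key2"])["score"].value_counts(bins=[0, 20, 80, 100])
    >>> len(out), int(out.sum())
    (15, 7)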
diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py
index f97362ce9c2a9..270f29914442a 100644
--- a/pandas/tests/series/methods/test_value_counts.py
+++ b/pandas/tests/series/methods/test_value_counts.py
@@ -179,6 +179,12 @@ def test_value_counts_categorical_with_nan(self):
         res = ser.value_counts(dropna=False, sort=False)
         tm.assert_series_equal(res, exp)
 
+    def test_value_counts_interval_bins(self):
+        ser = Series([1, 2, 3, 0, 1, 4], ["a", "a", "a", "b", "b", "c"])
+        res = ser.value_counts(bins=[0, 1, 2])
+        exp = Series([3, 1], res.index)
+        tm.assert_series_equal(res, exp)
+
     @pytest.mark.parametrize(
         "ser, dropna, exp",
         [
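In ``test_value_counts_interval_bins``, ``bins=[0, 1, 2]`` yields the two intervals ``(-0.001, 1.0]`` and ``(1.0, 2.0]`` (the first endpoint is nudged down because of ``include_lowest``); the values 3 and 4 fall outside both intervals and are dropped under the default ``dropna=True``, leaving counts of 3 and 1. A sketch of the expected output (formatting approximate):

    >>> import pandas as pd
    >>> ser = pd.Series([1, 2, 3, 0, 1, 4], ["a", "a", "a", "b", "b", "c"])
    >>> ser.value_counts(bins=[0, 1, 2])
    (-0.001, 1.0]    3
    (1.0, 2.0]       1
    dtype: int64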