-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Value counts normalize #33652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Value counts normalize #33652
Changes from all commits
bd9011a
d9d5ec1
86fe7f9
c34a863
9c1c269
5f8eb1d
1276166
a1b7197
9c3ede3
0cff92b
27aa460
27c9856
f5e9aeb
99b7112
75374b2
25b6c14
277ce52
73ef54b
6b97e0b
797f668
fce6998
637a609
c9a4383
7ae1280
d2399ea
3299a36
ec92f15
d6179b0
5abfb16
83ccfd2
5f33834
8562f1b
f685cb2
c21bdbb
e4c2552
74b13d8
f0e630a
9763e83
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1174,17 +1174,20 @@ def value_counts( | |
Parameters | ||
---------- | ||
normalize : bool, default False | ||
If True then the object returned will contain the relative | ||
frequencies of the unique values. | ||
If True, outputs the relative frequencies of the unique values. | ||
sort : bool, default True | ||
Sort by frequencies. | ||
ascending : bool, default False | ||
Sort in ascending order. | ||
bins : int, optional | ||
Rather than count values, group them into half-open bins, | ||
a convenience for ``pd.cut``, only works with numeric data. | ||
bins : integer or iterable of numeric, optional | ||
Rather than count individual values, group them into half-open bins. | ||
Only works with numeric data. | ||
If int, interpreted as number of bins. | ||
If iterable of numeric, will use provided numbers as bin endpoints. | ||
dropna : bool, default True | ||
Don't include counts of NaN. | ||
If False and NaNs are present, NaN will be a key in the output. | ||
.. versionchanged:: 1.1.2 | ||
|
||
Returns | ||
------- | ||
|
@@ -1221,15 +1224,26 @@ def value_counts( | |
|
||
Bins can be useful for going from a continuous variable to a | ||
categorical variable; instead of counting unique | ||
apparitions of values, divide the index in the specified | ||
number of half-open bins. | ||
instances of values, count the number of values that fall | ||
into half-open intervals. | ||
|
||
Bins can be an int. | ||
|
||
>>> s.value_counts(bins=3) | ||
(2.0, 3.0] 2 | ||
(0.996, 2.0] 2 | ||
(3.0, 4.0] 1 | ||
dtype: int64 | ||
|
||
Bins can also be an iterable of numbers. These numbers are treated | ||
as endpoints for the intervals. | ||
|
||
>>> s.value_counts(bins=[0, 2, 4, 9]) | ||
(2.0, 4.0] 3 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a space missing here? |
||
(-0.001, 2.0] 2 | ||
(4.0, 9.0] 0 | ||
dtype: int64 | ||
|
||
**dropna** | ||
|
||
With `dropna` set to `False` we can also see NaN index values. | ||
|
@@ -1242,6 +1256,7 @@ def value_counts( | |
1.0 1 | ||
dtype: int64 | ||
""" | ||
|
||
result = value_counts( | ||
self, | ||
sort=sort, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,7 +45,6 @@ | |
ensure_platform_int, | ||
is_bool, | ||
is_integer_dtype, | ||
is_interval_dtype, | ||
is_numeric_dtype, | ||
is_object_dtype, | ||
is_scalar, | ||
|
@@ -59,6 +58,7 @@ | |
validate_func_kwargs, | ||
) | ||
import pandas.core.algorithms as algorithms | ||
from pandas.core.algorithms import unique | ||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.base import DataError, SpecificationError | ||
import pandas.core.common as com | ||
|
@@ -79,6 +79,7 @@ | |
import pandas.core.indexes.base as ibase | ||
from pandas.core.internals import BlockManager | ||
from pandas.core.series import Series | ||
from pandas.core.sorting import compress_group_index | ||
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba | ||
|
||
from pandas.plotting import boxplot_frame_groupby | ||
|
@@ -685,7 +686,6 @@ def value_counts( | |
self, normalize=False, sort=True, ascending=False, bins=None, dropna=True | ||
): | ||
|
||
from pandas.core.reshape.merge import get_join_indexers | ||
from pandas.core.reshape.tile import cut | ||
|
||
if bins is not None and not np.iterable(bins): | ||
|
@@ -701,111 +701,111 @@ def value_counts( | |
|
||
ids, _, _ = self.grouper.group_info | ||
val = self.obj._values | ||
codes = self.grouper.reconstructed_codes # this will track the groups | ||
|
||
# groupby removes null keys from groupings | ||
mask = ids != -1 | ||
ids, val = ids[mask], val[mask] | ||
if dropna: | ||
mask = ~isna(val) | ||
if not mask.all(): | ||
ids, val = ids[mask], val[mask] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. codecov reports no testing here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Most features are tested in test_series_groupby_value_counts, which is parameterized and includes dropna as a parameter. That should address the below case as well (since it includes seed_nans in the dataframe generation. I could add specific tests for these cases, but it seems like they are covered in the existing tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @simonjayhawkins IIUC codecov gets results from the travis-37-cov build, which runs with |
||
|
||
if bins is None: | ||
lab, lev = algorithms.factorize(val, sort=True) | ||
llab = lambda lab, inc: lab[inc] | ||
val_lab, val_lev = algorithms.factorize( | ||
val, sort=True, na_sentinel=(None if dropna else -1) | ||
) | ||
else: | ||
# val_lab is a Categorical with categories an IntervalIndex | ||
val_lab = cut(Series(val), bins, include_lowest=True) | ||
val_lev = val_lab.cat.categories | ||
val_lab = val_lab.cat.codes.values | ||
|
||
# lab is a Categorical with categories an IntervalIndex | ||
lab = cut(Series(val), bins, include_lowest=True) | ||
lev = lab.cat.categories | ||
lab = lev.take(lab.cat.codes) | ||
llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] | ||
|
||
if is_interval_dtype(lab.dtype): | ||
# TODO: should we do this inside II? | ||
sorter = np.lexsort((lab.left, lab.right, ids)) | ||
else: | ||
sorter = np.lexsort((lab, ids)) | ||
if dropna: | ||
included = val_lab != -1 | ||
ids, val_lab = ids[included], val_lab[included] | ||
|
||
ids, lab = ids[sorter], lab[sorter] | ||
sorter = np.lexsort((val_lab, ids)) | ||
ids, val_lab = ids[sorter], val_lab[sorter] | ||
used_ids = unique(ids) | ||
if max(used_ids) >= len(codes[0]): | ||
# this means we had something skipped from the start | ||
used_ids = compress_group_index(used_ids)[0] | ||
codes = [code[used_ids] for code in codes] # drop what was taken out for n/a | ||
|
||
# group boundaries are where group ids change | ||
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] | ||
|
||
# new values are where sorted labels change | ||
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) | ||
inc = np.r_[True, lchanges] | ||
inc[idx] = True # group boundaries are also new values | ||
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts | ||
|
||
# num. of times each group should be repeated | ||
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) | ||
|
||
# multi-index components | ||
codes = self.grouper.reconstructed_codes | ||
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] | ||
levels = [ping.group_index for ping in self.grouper.groupings] + [lev] | ||
change_ids = ids[1:] != ids[:-1] | ||
changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1])) | ||
changes = np.r_[True, changes] | ||
val_lab = val_lab[changes] | ||
ids = ids[changes] | ||
cts = np.diff(np.nonzero(np.r_[changes, True]))[0] | ||
idx = np.r_[0, 1 + np.nonzero(change_ids)[0]] | ||
# how many times each index gets repeated | ||
rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx)) | ||
|
||
if (not dropna) and (-1 in val_lab): | ||
# in this case we need to explicitly add NaN as a level | ||
val_lev = np.r_[Index([np.nan]), val_lev] | ||
val_lab += 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. codecov reports no testing for this |
||
|
||
levels = [ping.group_index for ping in self.grouper.groupings] + [ | ||
Index(val_lev) | ||
] | ||
names = self.grouper.names + [self._selection_name] | ||
|
||
if dropna: | ||
mask = codes[-1] != -1 | ||
if mask.all(): | ||
dropna = False | ||
else: | ||
out, codes = out[mask], [level_codes[mask] for level_codes in codes] | ||
|
||
if normalize: | ||
out = out.astype("float") | ||
d = np.diff(np.r_[idx, len(ids)]) | ||
if dropna: | ||
m = ids[lab == -1] | ||
np.add.at(d, m, -1) | ||
acc = rep(d)[mask] | ||
else: | ||
acc = rep(d) | ||
out /= acc | ||
|
||
if sort and bins is None: | ||
cat = ids[inc][mask] if dropna else ids[inc] | ||
sorter = np.lexsort((out if ascending else -out, cat)) | ||
out, codes[-1] = out[sorter], codes[-1][sorter] | ||
num_repeats = np.diff(idx, append=len(change_ids) + 1) | ||
cts = cts.astype("float") / rep(num_repeats) | ||
# each divisor is the number of repeats for that index | ||
|
||
if bins is None: | ||
codes = [rep(level_codes) for level_codes in codes] + [val_lab] | ||
|
||
if sort: | ||
indices = tuple(reversed(codes[:-1])) | ||
sorter = np.lexsort( | ||
np.r_[[val_lab], [cts if ascending else -cts], indices] | ||
) # sorts using right columns first | ||
cts = cts[sorter] | ||
codes = [code[sorter] for code in codes] | ||
|
||
mi = MultiIndex( | ||
levels=levels, codes=codes, names=names, verify_integrity=False | ||
) | ||
|
||
if is_integer_dtype(out): | ||
out = ensure_int64(out) | ||
return self.obj._constructor(out, index=mi, name=self._selection_name) | ||
if is_integer_dtype(cts): | ||
cts = ensure_int64(cts) | ||
return self.obj._constructor(cts, index=mi, name=self._selection_name) | ||
|
||
# for compat. with libgroupby.value_counts need to ensure every | ||
# bin is present at every index level, null filled with zeros | ||
diff = np.zeros(len(out), dtype="bool") | ||
for level_codes in codes[:-1]: | ||
diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] | ||
|
||
ncat, nbin = diff.sum(), len(levels[-1]) | ||
|
||
left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] | ||
|
||
right = [diff.cumsum() - 1, codes[-1]] | ||
|
||
_, idx = get_join_indexers(left, right, sort=False, how="left") | ||
out = np.where(idx != -1, out[idx], 0) | ||
nbin = len(levels[-1]) | ||
ncat = len(codes[0]) | ||
fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64) | ||
id = 0 | ||
change_ids = np.r_[ # need to update now that we removed full repeats | ||
ids[1:] != ids[:-1], True | ||
] | ||
for i, ct in enumerate(cts): # fill in nonzero values of fout | ||
fout[id * nbin + val_lab[i]] = cts[i] | ||
id += change_ids[i] | ||
ncodes = [np.repeat(code, nbin) for code in codes] | ||
ncodes.append(np.tile(range(nbin), len(codes[0]))) | ||
|
||
if sort: | ||
sorter = np.lexsort((out if ascending else -out, left[0])) | ||
out, left[-1] = out[sorter], left[-1][sorter] | ||
|
||
# build the multi-index w/ full levels | ||
def build_codes(lev_codes: np.ndarray) -> np.ndarray: | ||
return np.repeat(lev_codes[diff], nbin) | ||
|
||
codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] | ||
codes.append(left[-1]) | ||
|
||
mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) | ||
|
||
if is_integer_dtype(out): | ||
out = ensure_int64(out) | ||
return self.obj._constructor(out, index=mi, name=self._selection_name) | ||
indices = tuple(reversed(ncodes[:-1])) | ||
sorter = np.lexsort( | ||
np.r_[[fout if ascending else -fout], indices] | ||
) # sorts using right columns first | ||
fout = fout[sorter] | ||
ncodes = [code[sorter] for code in ncodes] | ||
mi = MultiIndex( | ||
levels=levels, codes=ncodes, names=names, verify_integrity=False | ||
) | ||
if is_integer_dtype(fout): | ||
fout = ensure_int64(fout) | ||
return self.obj._constructor(fout, index=mi, name=self._selection_name) | ||
|
||
def count(self) -> Series: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it looks like part of this PR is about `normalize` and another part is about `dropna`. Could these be split into independent pieces?