-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
TST: Split and simplify test_value_counts_unique_nunique #32281
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c6ef5bd
5e1ff44
3abe0c1
f2a4ac6
e9e4489
6f13349
411a9a0
25eb1cc
ee94281
b54b19e
07df52a
ad541b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
import collections | ||
from datetime import datetime, timedelta | ||
from io import StringIO | ||
import sys | ||
|
@@ -15,7 +16,6 @@ | |
is_datetime64_dtype, | ||
is_datetime64tz_dtype, | ||
is_object_dtype, | ||
is_period_dtype, | ||
needs_i8_conversion, | ||
) | ||
|
||
|
@@ -26,11 +26,9 @@ | |
Index, | ||
Interval, | ||
IntervalIndex, | ||
PeriodIndex, | ||
Series, | ||
Timedelta, | ||
TimedeltaIndex, | ||
Timestamp, | ||
) | ||
import pandas._testing as tm | ||
|
||
|
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj): | |
assert Index([1]).item() == 1 | ||
assert Series([1]).item() == 1 | ||
|
||
def test_value_counts_unique_nunique(self, index_or_series_obj): | ||
orig = index_or_series_obj | ||
obj = orig.copy() | ||
klass = type(obj) | ||
values = obj._values | ||
|
||
if orig.duplicated().any(): | ||
pytest.xfail( | ||
"The test implementation isn't flexible enough to deal " | ||
"with duplicated values. This isn't a bug in the " | ||
"application code, but in the test code." | ||
) | ||
def test_unique(self, index_or_series_obj): | ||
obj = index_or_series_obj | ||
obj = np.repeat(obj, range(1, len(obj) + 1)) | ||
result = obj.unique() | ||
|
||
# create repeated values, 'n'th element is repeated by n+1 times | ||
if isinstance(obj, Index): | ||
expected_index = Index(obj[::-1]) | ||
expected_index.name = None | ||
obj = obj.repeat(range(1, len(obj) + 1)) | ||
# dict.fromkeys preserves the order | ||
unique_values = list(dict.fromkeys(obj.values)) | ||
if isinstance(obj, pd.MultiIndex): | ||
expected = pd.MultiIndex.from_tuples(unique_values) | ||
expected.names = obj.names | ||
tm.assert_index_equal(result, expected) | ||
elif isinstance(obj, pd.Index): | ||
expected = pd.Index(unique_values, dtype=obj.dtype) | ||
if is_datetime64tz_dtype(obj): | ||
expected = expected.normalize() | ||
tm.assert_index_equal(result, expected) | ||
else: | ||
expected_index = Index(values[::-1]) | ||
idx = obj.index.repeat(range(1, len(obj) + 1)) | ||
# take-based repeat | ||
indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1)) | ||
rep = values.take(indices) | ||
obj = klass(rep, index=idx) | ||
|
||
# check values has the same dtype as the original | ||
assert obj.dtype == orig.dtype | ||
|
||
expected_s = Series( | ||
range(len(orig), 0, -1), index=expected_index, dtype="int64" | ||
) | ||
expected = np.array(unique_values) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = obj.value_counts() | ||
tm.assert_series_equal(result, expected_s) | ||
assert result.index.name is None | ||
@pytest.mark.parametrize("null_obj", [np.nan, None]) | ||
Review thread: "can you use nulls_fixture" → "I tried using […]" (remainder of this reply was lost in extraction) → "ok, yeah we need to test these, can you create an issue. we will want to add these even if they need xfailing for now as there is no testing on them." → "xref: #32437" |
||
def test_unique_null(self, null_obj, index_or_series_obj): | ||
obj = index_or_series_obj | ||
|
||
if not allow_na_ops(obj): | ||
pytest.skip("type doesn't allow for NA operations") | ||
elif len(obj) < 1: | ||
pytest.skip("Test doesn't make sense on empty data") | ||
elif isinstance(obj, pd.MultiIndex): | ||
pytest.skip(f"MultiIndex can't hold '{null_obj}'") | ||
|
||
values = obj.values | ||
if needs_i8_conversion(obj): | ||
values[0:2] = iNaT | ||
else: | ||
values[0:2] = null_obj | ||
|
||
klass = type(obj) | ||
repeated_values = np.repeat(values, range(1, len(values) + 1)) | ||
obj = klass(repeated_values, dtype=obj.dtype) | ||
result = obj.unique() | ||
if isinstance(obj, Index): | ||
assert isinstance(result, type(obj)) | ||
tm.assert_index_equal(result, orig) | ||
assert result.dtype == orig.dtype | ||
elif is_datetime64tz_dtype(obj): | ||
# datetimetz Series returns array of Timestamp | ||
assert result[0] == orig[0] | ||
for r in result: | ||
assert isinstance(r, Timestamp) | ||
|
||
tm.assert_numpy_array_equal( | ||
result.astype(object), orig._values.astype(object) | ||
) | ||
|
||
unique_values_raw = dict.fromkeys(obj.values) | ||
# because np.nan == np.nan is False, but None == None is True | ||
# np.nan would be duplicated, whereas None wouldn't | ||
unique_values_not_null = [ | ||
val for val in unique_values_raw if not pd.isnull(val) | ||
] | ||
unique_values = [null_obj] + unique_values_not_null | ||
|
||
if isinstance(obj, pd.Index): | ||
expected = pd.Index(unique_values, dtype=obj.dtype) | ||
if is_datetime64tz_dtype(obj): | ||
result = result.normalize() | ||
expected = expected.normalize() | ||
elif isinstance(obj, pd.CategoricalIndex): | ||
expected = expected.set_categories(unique_values_not_null) | ||
tm.assert_index_equal(result, expected) | ||
else: | ||
tm.assert_numpy_array_equal(result, orig.values) | ||
assert result.dtype == orig.dtype | ||
expected = np.array(unique_values, dtype=obj.dtype) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# dropna=True would break for MultiIndex | ||
assert obj.nunique(dropna=False) == len(np.unique(obj.values)) | ||
def test_nunique(self, index_or_series_obj): | ||
obj = index_or_series_obj | ||
obj = np.repeat(obj, range(1, len(obj) + 1)) | ||
expected = len(obj.unique()) | ||
assert obj.nunique(dropna=False) == expected | ||
|
||
@pytest.mark.parametrize("null_obj", [np.nan, None]) | ||
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj): | ||
orig = index_or_series_obj | ||
obj = orig.copy() | ||
klass = type(obj) | ||
values = obj._ndarray_values | ||
num_values = len(orig) | ||
def test_nunique_null(self, null_obj, index_or_series_obj): | ||
Review comment: "same" (presumably the same request made on the earlier null_obj parametrization: "can you use nulls_fixture") |
||
obj = index_or_series_obj | ||
|
||
if not allow_na_ops(obj): | ||
pytest.skip("type doesn't allow for NA operations") | ||
elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)): | ||
pytest.skip(f"values of {klass} cannot be changed") | ||
elif isinstance(orig, pd.MultiIndex): | ||
pytest.skip("MultiIndex doesn't support isna") | ||
elif orig.duplicated().any(): | ||
pytest.xfail( | ||
"The test implementation isn't flexible enough to deal " | ||
"with duplicated values. This isn't a bug in the " | ||
"application code, but in the test code." | ||
) | ||
|
||
# special assign to the numpy array | ||
if is_datetime64tz_dtype(obj): | ||
if isinstance(obj, DatetimeIndex): | ||
v = obj.asi8 | ||
v[0:2] = iNaT | ||
values = obj._shallow_copy(v) | ||
else: | ||
obj = obj.copy() | ||
obj[0:2] = pd.NaT | ||
values = obj._values | ||
elif isinstance(obj, pd.MultiIndex): | ||
pytest.skip(f"MultiIndex can't hold '{null_obj}'") | ||
|
||
elif is_period_dtype(obj): | ||
values[0:2] = iNaT | ||
parr = type(obj._data)(values, dtype=obj.dtype) | ||
values = obj._shallow_copy(parr) | ||
elif needs_i8_conversion(obj): | ||
values = obj.values | ||
if needs_i8_conversion(obj): | ||
values[0:2] = iNaT | ||
values = obj._shallow_copy(values) | ||
else: | ||
values[0:2] = null_obj | ||
|
||
# check values has the same dtype as the original | ||
assert values.dtype == obj.dtype | ||
|
||
# create repeated values, 'n'th element is repeated by n+1 | ||
# times | ||
if isinstance(obj, (DatetimeIndex, PeriodIndex)): | ||
expected_index = obj.copy() | ||
expected_index.name = None | ||
klass = type(obj) | ||
repeated_values = np.repeat(values, range(1, len(values) + 1)) | ||
obj = klass(repeated_values, dtype=obj.dtype) | ||
|
||
# attach name to klass | ||
obj = klass(values.repeat(range(1, len(obj) + 1))) | ||
obj.name = "a" | ||
else: | ||
if isinstance(obj, DatetimeIndex): | ||
expected_index = orig._values._shallow_copy(values) | ||
else: | ||
expected_index = Index(values) | ||
expected_index.name = None | ||
obj = obj.repeat(range(1, len(obj) + 1)) | ||
obj.name = "a" | ||
|
||
# check values has the same dtype as the original | ||
assert obj.dtype == orig.dtype | ||
|
||
# check values correctly have NaN | ||
nanloc = np.zeros(len(obj), dtype=np.bool) | ||
nanloc[:3] = True | ||
if isinstance(obj, Index): | ||
tm.assert_numpy_array_equal(pd.isna(obj), nanloc) | ||
if isinstance(obj, pd.CategoricalIndex): | ||
assert obj.nunique() == len(obj.categories) | ||
assert obj.nunique(dropna=False) == len(obj.categories) + 1 | ||
else: | ||
exp = Series(nanloc, obj.index, name="a") | ||
tm.assert_series_equal(pd.isna(obj), exp) | ||
|
||
expected_data = list(range(num_values, 2, -1)) | ||
expected_data_na = expected_data.copy() | ||
if expected_data_na: | ||
expected_data_na.append(3) | ||
expected_s_na = Series( | ||
expected_data_na, | ||
index=expected_index[num_values - 1 : 0 : -1], | ||
dtype="int64", | ||
name="a", | ||
) | ||
expected_s = Series( | ||
expected_data, | ||
index=expected_index[num_values - 1 : 1 : -1], | ||
dtype="int64", | ||
name="a", | ||
) | ||
num_unique_values = len(obj.unique()) | ||
assert obj.nunique() == max(0, num_unique_values - 1) | ||
assert obj.nunique(dropna=False) == max(0, num_unique_values) | ||
|
||
result_s_na = obj.value_counts(dropna=False) | ||
tm.assert_series_equal(result_s_na, expected_s_na) | ||
assert result_s_na.index.name is None | ||
assert result_s_na.name == "a" | ||
result_s = obj.value_counts() | ||
tm.assert_series_equal(obj.value_counts(), expected_s) | ||
assert result_s.index.name is None | ||
assert result_s.name == "a" | ||
def test_value_counts(self, index_or_series_obj): | ||
obj = index_or_series_obj | ||
obj = np.repeat(obj, range(1, len(obj) + 1)) | ||
result = obj.value_counts() | ||
|
||
result = obj.unique() | ||
if isinstance(obj, Index): | ||
tm.assert_index_equal(result, Index(values[1:], name="a")) | ||
elif is_datetime64tz_dtype(obj): | ||
# unable to compare NaT / nan | ||
tm.assert_extension_array_equal(result[1:], values[2:]) | ||
assert result[0] is pd.NaT | ||
elif len(obj) > 0: | ||
tm.assert_numpy_array_equal(result[1:], values[2:]) | ||
|
||
assert pd.isna(result[0]) | ||
assert result.dtype == orig.dtype | ||
|
||
assert obj.nunique() == max(0, num_values - 2) | ||
assert obj.nunique(dropna=False) == max(0, num_values - 1) | ||
counter = collections.Counter(obj) | ||
expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) | ||
expected.index = expected.index.astype(obj.dtype) | ||
if isinstance(obj, pd.MultiIndex): | ||
expected.index = pd.Index(expected.index) | ||
|
||
# sort_index to avoid switched order when values share the same count | ||
result = result.sort_index() | ||
expected = expected.sort_index() | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("null_obj", [np.nan, None]) | ||
def test_value_counts_null(self, null_obj, index_or_series_obj): | ||
orig = index_or_series_obj | ||
obj = orig.copy() | ||
|
||
if not allow_na_ops(obj): | ||
pytest.skip("type doesn't allow for NA operations") | ||
elif len(obj) < 1: | ||
pytest.skip("Test doesn't make sense on empty data") | ||
elif isinstance(orig, pd.MultiIndex): | ||
pytest.skip(f"MultiIndex can't hold '{null_obj}'") | ||
|
||
values = obj.values | ||
if needs_i8_conversion(obj): | ||
values[0:2] = iNaT | ||
else: | ||
values[0:2] = null_obj | ||
|
||
klass = type(obj) | ||
repeated_values = np.repeat(values, range(1, len(values) + 1)) | ||
obj = klass(repeated_values, dtype=obj.dtype) | ||
|
||
# because np.nan == np.nan is False, but None == None is True | ||
# np.nan would be duplicated, whereas None wouldn't | ||
counter = collections.Counter(obj.dropna()) | ||
expected = pd.Series(dict(counter.most_common()), dtype=np.int64) | ||
expected.index = expected.index.astype(obj.dtype) | ||
|
||
tm.assert_series_equal(obj.value_counts(), expected) | ||
|
||
# can't use expected[null_obj] = 3 as | ||
# IntervalIndex doesn't allow assignment | ||
new_entry = pd.Series({np.nan: 3}, dtype=np.int64) | ||
expected = expected.append(new_entry) | ||
tm.assert_series_equal(obj.value_counts(dropna=False), expected) | ||
|
||
def test_value_counts_inferred(self, index_or_series): | ||
klass = index_or_series | ||
|
Uh oh!
There was an error while loading. Please reload this page.