From e22de7ebe9263786a35dee44050cbaf4b80f163c Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Tue, 18 Jun 2019 15:17:15 +0100 Subject: [PATCH 1/3] Calculate normalized freqs in value_counts correctly when bins is not None --- pandas/core/algorithms.py | 4 +--- pandas/tests/test_algos.py | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 932ac71a23ed0..e7b46309b95a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -670,7 +670,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) - result = result[result.index.notna()] result.index = result.index.astype('interval') result = result.sort_index() @@ -678,8 +677,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if dropna and (result.values == 0).all(): result = result.iloc[0:0] - # normalizing is by len of all (regardless of dropna) - counts = np.array([len(ii)]) + counts = np.array([result.sum()]) else: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 64d8436dd5fe3..90c9b7a3591a4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -967,21 +967,28 @@ def test_dropna(self): expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) tm.assert_series_equal(result, expected) - def test_value_counts_normalized(self): + @pytest.mark.parametrize('dropna, vals, index', [ + (False, [0.6, 0.2, 0.2], [np.nan, 2.0, 1.0]), + (True, [0.5, 0.5], [2.0, 1.0])]) + def test_value_counts_normalized(self, dropna, vals, index): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) dtypes = (np.float64, np.object, 'M8[ns]') for t in dtypes: s_typed = s.astype(t) - result = s_typed.value_counts(normalize=True, dropna=False) - expected = Series([0.6, 0.2, 0.2], - index=Series([np.nan, 2.0, 1.0], dtype=t)) + result = s_typed.value_counts(normalize=True, dropna=dropna) + expected = Series(vals, index=Series(index, dtype=t)) tm.assert_series_equal(result, expected) - result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], - index=Series([2.0, 1.0], dtype=t)) - tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('dropna, vals, tuples', [ + (False, [0.5, 0.3, 0.2], [(-0.005, 2.0), (2.0, 4.0), np.nan]), + (True, [0.625, 0.375], [(-0.005, 2.0), (2.0, 4.0)])]) + def test_value_counts_normalized_bins(self, dropna, vals, tuples): + # GH25970 + s = Series([1, 1, 2, 0, 1, np.nan, 4, 4, np.nan, 3]) + result = s.value_counts(normalize=True, bins=2, dropna=dropna) + expected = Series(vals, index=IntervalIndex.from_tuples(tuples)) + tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): arr = np.array([2**63], dtype=np.uint64) From 5d1e2f88a84841aceb803eb4d01611400c78b22d Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Wed, 19 Jun 2019 19:51:30 +0100 Subject: [PATCH 2/3] Parametrize dtype in test_value_counts_normalized --- pandas/tests/test_algos.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 90c9b7a3591a4..16d75cad5f837 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -967,18 +967,18 @@ def test_dropna(self): expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('dropna, vals, index', [ (False, [0.6, 0.2, 0.2], [np.nan, 2.0, 1.0]), (True, [0.5, 0.5], [2.0, 1.0])]) - def test_value_counts_normalized(self, dropna, vals, index): + @pytest.mark.parametrize('dtype', [np.float64, np.object, 'M8[ns]']) + def test_value_counts_normalized(self, dropna, vals, index, dtype): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) - dtypes = (np.float64, np.object, 'M8[ns]') - for t in dtypes: - s_typed = s.astype(t) - result = s_typed.value_counts(normalize=True, dropna=dropna) - expected = Series(vals, index=Series(index, dtype=t)) - tm.assert_series_equal(result, expected) + s_typed = s.astype(dtype) + result = s_typed.value_counts(normalize=True, dropna=dropna) + expected = Series(vals, index=Series(index, dtype=dtype)) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize('dropna, vals, tuples', [ (False, [0.5, 0.3, 0.2], [(-0.005, 2.0), (2.0, 4.0), np.nan]), From 0496b799e4e0ed62262b694a89d8ddef6a0dcd21 Mon Sep 17 00:00:00 2001 From: krsnik93 Date: Wed, 19 Jun 2019 19:52:51 +0100 Subject: [PATCH 3/3] Remove extra empty line --- pandas/tests/test_algos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 16d75cad5f837..0991072c442c2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -967,7 +967,6 @@ def test_dropna(self): expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, vals, index', [ (False, [0.6, 0.2, 0.2], [np.nan, 2.0, 1.0]), (True, [0.5, 0.5], [2.0, 1.0])])