From d0565b14ddf533f2e547205f3a63bd9600a79931 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Dec 2021 17:29:30 +0530 Subject: [PATCH 01/19] Dtype is same --- pandas/core/internals/blocks.py | 2 +- pandas/tests/series/methods/test_replace.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f4b301c25c603..e40773b255c7a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -663,7 +663,7 @@ def replace( regex = should_use_regex(regex, to_replace) if regex: - return self._replace_regex(to_replace, value, inplace=inplace) + self.values = np.asarray(self._replace_regex(to_replace, value, inplace=inplace)) if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 28a0df99bb2b6..c21682813493a 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -512,3 +512,10 @@ def test_pandas_replace_na(self): result = ser.replace(regex_mapping, regex=True) exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) + + def test_replace_regex_dtype(self): + #GH-48644 + s = pd.Series(['0']) + exp = s.replace(to_replace = '0', value = 1, regex = False).dtype + res = s.replace(to_replace = '0', value = 1, regex = True).dtype + assert exp == res From 2fc7c96a9ff4e1b92438312c95c6cb7bd6729718 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Dec 2021 17:33:04 +0530 Subject: [PATCH 02/19] pre-commit hook solved --- pandas/core/internals/blocks.py | 4 +++- pandas/tests/series/methods/test_replace.py | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e40773b255c7a..48b1c5a08d751 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -663,7 +663,9 @@ def replace( regex = should_use_regex(regex, to_replace) if regex: - self.values = np.asarray(self._replace_regex(to_replace, value, inplace=inplace)) + self.values = np.asarray( + self._replace_regex(to_replace, value, inplace=inplace) + ) if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c21682813493a..09e3041c65eed 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -514,8 +514,8 @@ def test_pandas_replace_na(self): tm.assert_series_equal(result, exp) def test_replace_regex_dtype(self): - #GH-48644 - s = pd.Series(['0']) - exp = s.replace(to_replace = '0', value = 1, regex = False).dtype - res = s.replace(to_replace = '0', value = 1, regex = True).dtype + # GH-48644 + s = pd.Series(["0"]) + exp = s.replace(to_replace="0", value=1, regex=False).dtype + res = s.replace(to_replace="0", value=1, regex=True).dtype assert exp == res From cb4dd4ff1e3b1590cf3ce29e3dc7291f699d835e Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Dec 2021 21:37:46 +0530 Subject: [PATCH 03/19] Update blocks.py --- pandas/core/internals/blocks.py | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 48b1c5a08d751..27b8a0b10669f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -661,11 +661,14 @@ def replace( return [blk] regex = should_use_regex(regex, to_replace) + mask = missing.mask_missing(values, to_replace) if regex: - self.values = np.asarray( - self._replace_regex(to_replace, value, inplace=inplace) - ) + if self._can_hold_element(value): + blk = self if inplace else self.copy() + putmask_inplace(blk.values, mask, value) + return blk.convert(numeric=False, copy=False) + return self._replace_regex(to_replace, value, inplace=inplace) if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that @@ -674,7 +677,6 @@ def replace( # replace_list instead of replace. return [self] if inplace else [self.copy()] - mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. From 1129c9acf3aab6ea20de8fde7f3499674366c009 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Dec 2021 22:52:44 +0530 Subject: [PATCH 04/19] changed replace regex --- pandas/core/internals/blocks.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 27b8a0b10669f..04f7ae9d18356 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -664,10 +664,6 @@ def replace( mask = missing.mask_missing(values, to_replace) if regex: - if self._can_hold_element(value): - blk = self if inplace else self.copy() - putmask_inplace(blk.values, mask, value) - return blk.convert(numeric=False, copy=False) return self._replace_regex(to_replace, value, inplace=inplace) if not self._can_hold_element(to_replace): @@ -743,7 +739,10 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - return [block] + if (self.ndim == 1 or self.shape[0] == 1): + return block.convert(numeric=False, copy=False) + else: + return [block] @final def _replace_list( From 1a1c378229e1b263119b6bcf0e2e4e0e45f72c28 Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Wed, 15 Dec 2021 22:54:21 +0530 Subject: [PATCH 05/19] precommit solved --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 04f7ae9d18356..b50c45952eb63 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -739,7 +739,7 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - if (self.ndim == 1 or self.shape[0] == 1): + if self.ndim == 1 or self.shape[0] == 1: return block.convert(numeric=False, copy=False) else: return [block] From eb06d439567db2c5c340c2a0132e9e7da50c8e99 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Fri, 17 Dec 2021 15:33:06 +0530 Subject: [PATCH 06/19] Changed test to assert series equal --- pandas/tests/series/methods/test_replace.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 09e3041c65eed..f6540f7105009 100644 --- a/pandas/tests/series/methods/test_replace.py +++ 
b/pandas/tests/series/methods/test_replace.py @@ -516,6 +516,6 @@ def test_pandas_replace_na(self): def test_replace_regex_dtype(self): # GH-48644 s = pd.Series(["0"]) - exp = s.replace(to_replace="0", value=1, regex=False).dtype - res = s.replace(to_replace="0", value=1, regex=True).dtype - assert exp == res + exp = s.replace(to_replace="0", value=1, regex=False) + res = s.replace(to_replace="0", value=1, regex=True) + tm.assert_series_equal(res, exp) From fdbd5f50b4f651dbedffec82db3bde8e5468ecfd Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Fri, 17 Dec 2021 16:14:49 +0530 Subject: [PATCH 07/19] Changes test --- pandas/tests/series/methods/test_replace.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f6540f7105009..e822dfa5d2d57 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -516,6 +516,8 @@ def test_pandas_replace_na(self): def test_replace_regex_dtype(self): # GH-48644 s = pd.Series(["0"]) - exp = s.replace(to_replace="0", value=1, regex=False) - res = s.replace(to_replace="0", value=1, regex=True) - tm.assert_series_equal(res, exp) + expected = pd.Series([1]) + result_series_false = s.replace(to_replace="0", value=1, regex=False) + tm.assert_series_equal(result_series_false, expected) + result_series_true = s.replace(to_replace="0", value=1, regex=True) + tm.assert_series_equal(result_series_true, expected) From 087aad27a4c1fbac9f2c67658e70523f33b1f03e Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Fri, 17 Dec 2021 22:02:36 +0530 Subject: [PATCH 08/19] Added parameterise --- pandas/tests/series/methods/test_replace.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index e822dfa5d2d57..33e4989aa9769 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -513,11 +513,10 @@ def test_pandas_replace_na(self): exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) - def test_replace_regex_dtype(self): + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_regex_dtype(self, frame): # GH-48644 - s = pd.Series(["0"]) + series = pd.Series(["0"]) expected = pd.Series([1]) - result_series_false = s.replace(to_replace="0", value=1, regex=False) - tm.assert_series_equal(result_series_false, expected) - result_series_true = s.replace(to_replace="0", value=1, regex=True) - tm.assert_series_equal(result_series_true, expected) + result = series.replace(to_replace="0", value=1, regex=frame) + tm.assert_series_equal(result, expected) From 54f56766794c58968ca63c2d89295c11b69538fc Mon Sep 17 00:00:00 2001 From: shubham11941140 <63910248+shubham11941140@users.noreply.github.com> Date: Sat, 18 Dec 2021 00:29:21 +0530 Subject: [PATCH 09/19] changed --- pandas/core/internals/blocks.py | 4 ++++ pandas/tests/series/methods/test_replace.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da2ff58ea3d0d..0507694e984f2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -731,6 +731,10 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) + if self.ndim == 1 or self.shape[0] == 1: + return 
block.convert(numeric=False, copy=False) + else: + return [block] return [block] @final diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 3a55062af618f..b720e2014f221 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -514,3 +514,11 @@ def test_pandas_replace_na(self): result = ser.replace(regex_mapping, regex=True) exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_regex_dtype(self, frame): + # GH-48644 + series = pd.Series(["0"]) + expected = pd.Series([1]) + result = series.replace(to_replace="0", value=1, regex=frame) + tm.assert_series_equal(result, expected) From a4387628234772726b833df58e2b825823cf208c Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sat, 18 Dec 2021 11:01:10 +0530 Subject: [PATCH 10/19] Added test --- pandas/core/internals/blocks.py | 2 +- pandas/tests/frame/methods/test_replace.py | 7 +++++++ pandas/tests/series/methods/test_replace.py | 6 +++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 09cac85384d81..8a608ec3c89d3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -731,7 +731,7 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - if self.ndim == 1 or self.shape[0] == 1: + if (self.ndim == 1 or self.shape[0] == 1) or (self.ndim == 2 and self.shape[0] != 1): return block.convert(numeric=False, copy=False) else: return [block] diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 28e28490c73b9..9ce646a4ecddc 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1503,3 +1503,10 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_frame(self, regex): + df = pd.DataFrame({"A": ["0"], "B": ["0"]}) + expected = pd.DataFrame({"A": [1], "B": [1]}) + result = df.replace(to_replace="0", value=1, regex=regex) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b720e2014f221..da9068b6f8ee7 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -515,10 +515,10 @@ def test_pandas_replace_na(self): exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("frame", [False, True]) - def test_replace_regex_dtype(self, frame): + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=frame) + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) From 96171658c00c0e8cec2c05068222252ada81243c Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sat, 18 Dec 2021 11:04:07 +0530 Subject: [PATCH 11/19] precommit --- pandas/core/internals/blocks.py | 4 +++- pandas/tests/frame/methods/test_replace.py | 4 ++-- 2 files changed, 5 insertions(+), 3 
deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8a608ec3c89d3..c182c6ea8aa3d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -731,7 +731,9 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - if (self.ndim == 1 or self.shape[0] == 1) or (self.ndim == 2 and self.shape[0] != 1): + if (self.ndim == 1 or self.shape[0] == 1) or ( + self.ndim == 2 and self.shape[0] != 1 + ): return block.convert(numeric=False, copy=False) else: return [block] diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9ce646a4ecddc..5d06fff16995c 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1506,7 +1506,7 @@ def test_regex_replace_scalar( @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): - df = pd.DataFrame({"A": ["0"], "B": ["0"]}) - expected = pd.DataFrame({"A": [1], "B": [1]}) + df = DataFrame({"A": ["0"], "B": ["0"]}) + expected = DataFrame({"A": [1], "B": [1]}) result = df.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result, expected) From 36e9a5d0c299eadefa04bb2450994c33520e00ef Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sat, 18 Dec 2021 12:08:30 +0530 Subject: [PATCH 12/19] Added extra testcase --- pandas/core/internals/blocks.py | 7 +------ pandas/tests/frame/methods/test_replace.py | 14 ++++++++++---- pandas/tests/series/methods/test_replace.py | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c182c6ea8aa3d..9ae2cdc96e02b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -731,12 +731,7 @@ def _replace_regex( replace_regex(new_values, rx, value, mask) block = self.make_block(new_values) - if (self.ndim == 1 or self.shape[0] == 1) or ( - self.ndim == 2 and self.shape[0] != 1 - ): - return block.convert(numeric=False, copy=False) - else: - return [block] + return block.convert(numeric=False, copy=False) @final def replace_list( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 5d06fff16995c..95fc3bddc6f78 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1506,7 +1506,13 @@ def test_regex_replace_scalar( @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): - df = DataFrame({"A": ["0"], "B": ["0"]}) - expected = DataFrame({"A": [1], "B": [1]}) - result = df.replace(to_replace="0", value=1, regex=regex) - tm.assert_frame_equal(result, expected) + # GH-48644 + df1 = DataFrame({"A": ["0"], "B": ["0"]}) + expected_df1 = DataFrame({"A": [1], "B": [1]}) + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + tm.assert_frame_equal(result_df1, expected_df1) + + df2 = DataFrame({"A": ["0"], "B": [np.NaN]}) + expected_df2 = DataFrame({"A": [1], "B": [np.NaN]}) + result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + tm.assert_frame_equal(result_df2, expected_df2) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index da9068b6f8ee7..9dfb361527b3e 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -516,7 +516,7 @@ def test_pandas_replace_na(self): tm.assert_series_equal(result, 
exp) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype(self, regex): + def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) From f04d05d668066ab9ef382360bcca92fcd0cfe3b1 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sat, 18 Dec 2021 23:03:30 +0530 Subject: [PATCH 13/19] Changed to string --- pandas/tests/frame/methods/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 95fc3bddc6f78..65bd2207c00a7 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1512,7 +1512,7 @@ def test_replace_regex_dtype_frame(self, regex): result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) - df2 = DataFrame({"A": ["0"], "B": [np.NaN]}) - expected_df2 = DataFrame({"A": [1], "B": [np.NaN]}) + df2 = DataFrame({"A": ["0"], "B": ["1"]}) + expected_df2 = DataFrame({"A": [1], "B": ["1"]}) result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) From ddbb1135cc9026ebccda3a100e883997f981095b Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sun, 19 Dec 2021 10:21:07 +0530 Subject: [PATCH 14/19] whatsnew note added - 1.4 --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index caf3a4281561f..52c29236a8da1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -816,7 +816,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) -- +- Bug in :class:`replace` results is different ``dtype`` based on `regex` (:issue:`44864`) Sparse ^^^^^^ From 75e3bd8229bd918e6e5140ad6db6a6dc061a3267 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sun, 19 Dec 2021 10:24:21 +0530 Subject: [PATCH 15/19] whatsnew fixed --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 52c29236a8da1..adac1321a35ac 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -816,7 +816,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) -- Bug in :class:`replace` results is different ``dtype`` based on `regex` (:issue:`44864`) +- Bug in :class:`replace` results is different ``dtype`` based on `regex` parameter (:issue:`44864`) Sparse ^^^^^^ From af0585c82944a7c453917a4d70b69ae5c0ae2d23 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Sun, 19 Dec 2021 10:32:27 +0530 Subject: [PATCH 16/19] whatsnew-fix --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index adac1321a35ac..b56f176a8629e 100644 --- 
a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -816,7 +816,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) -- Bug in :class:`replace` results is different ``dtype`` based on `regex` parameter (:issue:`44864`) +- Bug in :func:`replace` results is different ``dtype`` based on ``regex`` parameter (:issue:`44864`) Sparse ^^^^^^ From 2a80b697988f640642ff61655d9da060ee57713c Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Mon, 20 Dec 2021 10:30:20 +0530 Subject: [PATCH 17/19] fix1 --- .gitignore | 2 + .../development/contributing_codebase.rst | 42 +- doc/source/development/developer.rst | 2 +- doc/source/reference/groupby.rst | 1 + doc/source/user_guide/io.rst | 12 +- doc/source/user_guide/timeseries.rst | 2 +- doc/source/whatsnew/v1.4.0.rst | 15 +- pandas/__init__.py | 8 +- pandas/_libs/sparse_op_helper.pxi.in | 5 + pandas/_libs/tslibs/offsets.pyx | 16 +- pandas/_libs/tslibs/period.pyx | 5 +- pandas/_libs/tslibs/timestamps.pyi | 96 ++-- pandas/_libs/tslibs/timestamps.pyx | 1 - pandas/_testing/_hypothesis.py | 4 + pandas/core/array_algos/replace.py | 3 +- pandas/core/arrays/sparse/array.py | 10 + pandas/core/base.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/base.py | 1 + pandas/core/groupby/generic.py | 189 ++++++++ pandas/core/groupby/groupby.py | 37 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/indexes/base.py | 3 +- pandas/core/indexes/range.py | 6 +- pandas/core/internals/blocks.py | 47 +- pandas/io/json/_json.py | 4 +- pandas/io/json/_table_schema.py | 14 +- pandas/io/parsers/base_parser.py | 19 +- pandas/tests/api/test_api.py | 3 - pandas/tests/arithmetic/conftest.py | 8 +- pandas/tests/arithmetic/test_datetime64.py | 332 +++++-------- pandas/tests/arithmetic/test_numeric.py | 58 +-- pandas/tests/arithmetic/test_timedelta64.py | 201 +++----- .../tests/arrays/categorical/test_replace.py | 12 +- .../tests/arrays/sparse/test_arithmetics.py | 40 +- pandas/tests/extension/date/__init__.py | 6 + pandas/tests/extension/date/array.py | 180 +++++++ pandas/tests/extension/decimal/array.py | 7 +- pandas/tests/frame/indexing/test_where.py | 17 +- pandas/tests/frame/methods/test_replace.py | 13 +- pandas/tests/groupby/conftest.py | 1 + pandas/tests/groupby/test_allowlist.py | 1 + .../tests/groupby/test_frame_value_counts.py | 444 ++++++++++++++++++ pandas/tests/groupby/test_indexing.py | 18 + pandas/tests/groupby/test_numba.py | 12 +- pandas/tests/indexes/common.py | 15 + pandas/tests/indexes/interval/test_base.py | 6 + .../period/methods/test_to_timestamp.py | 30 ++ pandas/tests/indexing/test_coercion.py | 17 +- pandas/tests/internals/test_internals.py | 3 - pandas/tests/io/formats/test_to_html.py | 88 ++-- .../tests/io/json/test_json_table_schema.py | 5 +- .../json/test_json_table_schema_ext_dtype.py | 265 +++++++++++ pandas/tests/io/json/test_pandas.py | 2 +- .../io/parser/common/test_file_buffer_url.py | 1 - pandas/tests/io/parser/test_header.py | 27 +- pandas/tests/io/parser/test_index_col.py | 20 + .../io/parser/usecols/test_usecols_basic.py | 22 + pandas/tests/io/pytables/test_store.py | 3 +- pandas/tests/io/test_html.py | 21 +- pandas/tests/io/test_sql.py | 2 +- .../tests/reductions/test_stat_reductions.py | 7 +- .../reshape/concat/test_append_common.py | 
431 +++++++++-------- pandas/tests/reshape/concat/test_index.py | 49 ++ pandas/tests/reshape/merge/test_merge.py | 2 +- pandas/tests/series/indexing/test_get.py | 3 +- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/series/methods/test_replace.py | 130 ++++- pandas/tests/strings/test_split_partition.py | 4 +- pandas/tests/test_algos.py | 4 +- pandas/tests/test_sorting.py | 402 ++++++++-------- pandas/tests/tseries/offsets/test_offsets.py | 64 ++- pandas/tests/window/moments/conftest.py | 71 +-- .../moments/test_moments_consistency_ewm.py | 207 ++++---- .../test_moments_consistency_expanding.py | 189 ++++---- .../test_moments_consistency_rolling.py | 255 +++++----- 76 files changed, 2798 insertions(+), 1452 deletions(-) create mode 100644 pandas/tests/extension/date/__init__.py create mode 100644 pandas/tests/extension/date/array.py create mode 100644 pandas/tests/groupby/test_frame_value_counts.py create mode 100644 pandas/tests/io/json/test_json_table_schema_ext_dtype.py diff --git a/.gitignore b/.gitignore index 2c337be60e94e..87224f1d6060f 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,8 @@ dist *.egg-info .eggs .pypirc +# type checkers +pandas/py.typed # tox testing tool .tox diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 4cea030546635..41fe88e02318a 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -303,7 +303,7 @@ pandas strongly encourages the use of :pep:`484` style type hints. New developme Style guidelines ~~~~~~~~~~~~~~~~ -Types imports should follow the ``from typing import ...`` convention. So rather than +Type imports should follow the ``from typing import ...`` convention. Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than .. code-block:: python @@ -315,21 +315,31 @@ You should write .. code-block:: python - from typing import List, Optional, Union + primes: list[int] = [] - primes: List[int] = [] +``Optional`` should be avoided in favor of the shorter ``| None``, so instead of -``Optional`` should be used where applicable, so instead of +.. code-block:: python + + from typing import Union + + maybe_primes: list[Union[int, None]] = [] + +or .. code-block:: python - maybe_primes: List[Union[int, None]] = [] + from typing import Optional + + maybe_primes: list[Optional[int]] = [] You should write .. code-block:: python - maybe_primes: List[Optional[int]] = [] + from __future__ import annotations # noqa: F404 + + maybe_primes: list[int | None] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -410,6 +420,26 @@ A recent version of ``numpy`` (>=1.21.0) is required for type validation. .. _contributing.ci: +Testing type hints in code using pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + * Pandas is not yet a py.typed library (:pep:`561`)! + The primary purpose of locally declaring pandas as a py.typed library is to test and + improve the pandas-builtin type annotations. 
+ +Until pandas becomes a py.typed library, it is possible to easily experiment with the type +annotations shipped with pandas by creating an empty file named "py.typed" in the pandas +installation folder: + +.. code-block:: none + + python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" + +The existence of the py.typed file signals to type checkers that pandas is already a py.typed +library. This makes type checkers aware of the type annotations shipped with pandas. + Testing with continuous integration ----------------------------------- diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index d701208792a4c..6de237b70f08d 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -180,7 +180,7 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0', + 'pandas_version': '1.4.0', 'creator': { 'library': 'pyarrow', 'version': '0.13.0' diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index ccf130d03418c..2bb0659264eb0 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -122,6 +122,7 @@ application to columns of a specific data type. DataFrameGroupBy.skew DataFrameGroupBy.take DataFrameGroupBy.tshift + DataFrameGroupBy.value_counts The following methods are available only for ``SeriesGroupBy`` objects. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cd7105d125947..403599297a492 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1903,6 +1903,7 @@ with optional parameters: ``index``; dict like {index -> {column -> value}} ``columns``; dict like {column -> {index -> value}} ``values``; just the values array + ``table``; adhering to the JSON `Table Schema`_ * ``date_format`` : string, type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601. * ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. @@ -2477,7 +2478,6 @@ A few notes on the generated table schema: * For ``MultiIndex``, ``mi.names`` is used. If any level has no name, then ``level_`` is used. - ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a round-trippable manner. @@ -2519,8 +2519,18 @@ indicate missing values and the subsequent read cannot distinguish the intent. os.remove("test.json") +When using ``orient='table'`` along with user-defined ``ExtensionArray``, +the generated schema will contain an additional ``extDtype`` key in the respective +``fields`` element. This extra key is not standard but does enable JSON roundtrips +for extension types (e.g. ``read_json(df.to_json(orient="table"), orient="table")``). + +The ``extDtype`` key carries the name of the extension, if you have properly registered +the ``ExtensionDtype``, pandas will use said name to perform a lookup into the registry +and re-convert the serialized data into your custom dtype. + .. _Table Schema: https://specs.frictionlessdata.io/table-schema/ + HTML ---- diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index fde9ff0450a12..3fd6fe67772bc 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2424,7 +2424,7 @@ you can use the ``tz_convert`` method. 
For ``pytz`` time zones, it is incorrect to pass a time zone object directly into the ``datetime.datetime`` constructor - (e.g., ``datetime.datetime(2011, 1, 1, tz=pytz.timezone('US/Eastern'))``. + (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``. Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object. diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b56f176a8629e..1840b8d1b1ba9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -217,9 +217,10 @@ Other enhancements - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) +- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) -- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`) +- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`) - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`) @@ -231,6 +232,7 @@ Other enhancements - :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`) - :meth:`read_json` can now parse unsigned long long integers (:issue:`26068`) - :meth:`DataFrame.take` now raises a ``TypeError`` when passed a scalar for the indexer (:issue:`42875`) +- :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). 
- @@ -454,6 +456,7 @@ Other API changes - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`) - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`) - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`) +- :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`) - Changed the ``name`` attribute of several holidays in ``USFederalHolidayCalendar`` to match `official federal holiday names `_ @@ -529,7 +532,7 @@ Other Deprecations - Deprecated silent dropping of columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a dictionary (:issue:`43740`) - Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, and some cases of ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`) - Deprecated casting behavior when setting timezone-aware value(s) into a timezone-aware :class:`Series` or :class:`DataFrame` column when the timezones do not match. Previously this cast to object dtype. In a future version, the values being inserted will be converted to the series or column's existing timezone (:issue:`37605`) -- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`) +- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. 
In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`,:issue:`44940`) - Deprecated the 'errors' keyword argument in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, and meth:`DataFrame.mask`; in a future version the argument will be removed (:issue:`44294`) - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`) - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) @@ -540,6 +543,7 @@ Other Deprecations - Deprecated parameter ``names`` in :meth:`Index.copy` (:issue:`44916`) - A deprecation warning is now shown for :meth:`DataFrame.to_latex` indicating the arguments signature may change and emulate more the arguments to :meth:`.Styler.to_latex` in future versions (:issue:`44411`) - Deprecated :meth:`Categorical.replace`, use :meth:`Series.replace` instead (:issue:`44929`) +- Deprecated :meth:`Index.__getitem__` with a bool key; use ``index.values[key]`` to get the old behavior (:issue:`44051`) - .. --------------------------------------------------------------------------- @@ -627,6 +631,7 @@ Datetimelike - Bug in adding a ``np.timedelta64`` object to a :class:`BusinessDay` or :class:`CustomBusinessDay` object incorrectly raising (:issue:`44532`) - Bug in :meth:`Index.insert` for inserting ``np.datetime64``, ``np.timedelta64`` or ``tuple`` into :class:`Index` with ``dtype='object'`` with negative loc adding ``None`` and replacing existing value (:issue:`44509`) - Bug in :meth:`Series.mode` with ``DatetimeTZDtype`` incorrectly returning timezone-naive and ``PeriodDtype`` incorrectly raising (:issue:`41927`) +- Bug in :class:`DateOffset`` addition with :class:`Timestamp` where ``offset.nanoseconds`` would not be included in the result. (:issue:`43968`) - Timedelta @@ -760,6 +765,7 @@ I/O - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. 
(:issue:`39465`) - Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) +- Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`) - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) - @@ -769,6 +775,7 @@ Period - Bug in adding a :class:`Period` object to a ``np.timedelta64`` object incorrectly raising ``TypeError`` (:issue:`44182`) - Bug in :meth:`PeriodIndex.to_timestamp` when the index has ``freq="B"`` inferring ``freq="D"`` for its result instead of ``freq="B"`` (:issue:`44105`) - Bug in :class:`Period` constructor incorrectly allowing ``np.timedelta64("NaT")`` (:issue:`44507`) +- Bug in :meth:`PeriodIndex.to_timestamp` giving incorrect values for indexes with non-contiguous data (:issue:`44100`) - Plotting @@ -794,6 +801,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.mean` failing with ``complex`` dtype (:issue:`43701`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and index is decreasing (:issue:`43927`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) +- Bug in :meth:`GroupBy.mean` raising ``KeyError`` when column was selected at least twice (:issue:`44924`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`) @@ -824,6 +832,7 @@ Sparse - Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`) - Bug in :meth:`DataFrame.sparse.to_coo` silently converting non-zero fill values to zero (:issue:`24817`) - Bug in :class:`SparseArray` comparison methods with an array-like operand of mismatched length raising ``AssertionError`` or unclear ``ValueError`` depending on the input (:issue:`43863`) +- Bug in :class:`SparseArray` arithmetic methods ``floordiv`` and ``mod`` behaviors when dividing by zero not matching the non-sparse :class:`Series` behavior (:issue:`38172`) - ExtensionArray @@ -837,7 +846,7 @@ ExtensionArray - Bug in :func:`array` incorrectly raising when passed a ``ndarray`` with ``float16`` dtype (:issue:`44715`) - Bug in calling ``np.sqrt`` on :class:`BooleanArray` returning a malformed :class:`FloatingArray` (:issue:`44715`) - Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. 
``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) -- +- Fixed bug in :meth:`Series.replace` with ``FloatDtype``, ``string[python]``, or ``string[pyarrow]`` dtype not being preserved when possible (:issue:`33484`) Styler ^^^^^^ diff --git a/pandas/__init__.py b/pandas/__init__.py index 9505d0481ee19..6ee0cf5ae07d5 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -23,13 +23,15 @@ try: from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib -except ImportError as e: # pragma: no cover - module = e.name +except ImportError as err: # pragma: no cover + module = err.name raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " "'python setup.py build_ext --force' to build the C extensions first." - ) from e + ) from err +else: + del _tslib, _lib, _hashtable from pandas._config import ( get_option, diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index c6e65f8b96187..e6a2c7b1b050a 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -42,6 +42,11 @@ cdef inline sparse_t __mod__(sparse_t a, sparse_t b): cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b): if b == 0: if sparse_t is float64_t: + # Match non-sparse Series behavior implemented in mask_zero_div_zero + if a > 0: + return INF + elif a < 0: + return -INF return NaN else: return 0 diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7e6d8fa38aa45..6df4abc160b0b 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -186,8 +186,9 @@ def apply_wraps(func): if self.normalize: result = result.normalize() - # nanosecond may be deleted depending on offset process - if not self.normalize and nano != 0: + # If the offset object does not have a nanoseconds component, + # the result's nanosecond component may be lost. 
+ if not self.normalize and nano != 0 and not hasattr(self, "nanoseconds"): if result.nanosecond != nano: if result.tz is not None: # convert to UTC @@ -333,7 +334,7 @@ cdef _determine_offset(kwds): # sub-daily offset - use timedelta (tz-aware) offset = timedelta(**kwds_no_nanos) else: - offset = timedelta(1) + offset = timedelta(0) return offset, use_relativedelta @@ -1068,12 +1069,17 @@ cdef class RelativeDeltaOffset(BaseOffset): # perform calculation in UTC other = other.replace(tzinfo=None) + if hasattr(self, "nanoseconds"): + td_nano = Timedelta(nanoseconds=self.nanoseconds) + else: + td_nano = Timedelta(0) + if self.n > 0: for i in range(self.n): - other = other + self._offset + other = other + self._offset + td_nano else: for i in range(-self.n): - other = other - self._offset + other = other - self._offset - td_nano if tzinfo is not None and self._use_relativedelta: # bring tz back from UTC calculation diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 67696f9740ea1..1df1c9a947e8d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1088,6 +1088,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ cdef: Py_ssize_t n = len(arr) + Py_ssize_t increment = arr.strides[0] // 8 ndarray[int64_t] result = np.empty(n, dtype=np.int64) _period_asfreq( @@ -1097,6 +1098,7 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): freq1, freq2, end, + increment, ) return result @@ -1110,6 +1112,7 @@ cdef void _period_asfreq( int freq1, int freq2, bint end, + Py_ssize_t increment=1, ): """See period_asfreq.__doc__""" cdef: @@ -1127,7 +1130,7 @@ cdef void _period_asfreq( get_asfreq_info(freq1, freq2, end, &af_info) for i in range(length): - val = ordinals[i] + val = ordinals[i * increment] if val != NPY_NAT: val = func(val, &af_info) out[i] = val diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 17df594a39c44..ecddd83322bbf 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -8,7 +8,6 @@ from datetime import ( from time import struct_time from typing import ( ClassVar, - Type, TypeVar, overload, ) @@ -22,9 +21,9 @@ from pandas._libs.tslibs import ( Timedelta, ) -_S = TypeVar("_S") +_DatetimeT = TypeVar("_DatetimeT", bound=datetime) -def integer_op_not_supported(obj) -> TypeError: ... +def integer_op_not_supported(obj: object) -> TypeError: ... class Timestamp(datetime): min: ClassVar[Timestamp] @@ -35,7 +34,7 @@ class Timestamp(datetime): # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") def __new__( # type: ignore[misc] - cls: Type[_S], + cls: type[_DatetimeT], ts_input: int | np.integer | float @@ -43,9 +42,9 @@ class Timestamp(datetime): | _date | datetime | np.datetime64 = ..., - freq=..., + freq: int | None | str | BaseOffset = ..., tz: str | _tzinfo | None | int = ..., - unit=..., + unit: str | int | None = ..., year: int | None = ..., month: int | None = ..., day: int | None = ..., @@ -57,7 +56,7 @@ class Timestamp(datetime): tzinfo: _tzinfo | None = ..., *, fold: int | None = ..., - ) -> _S | NaTType: ... + ) -> _DatetimeT | NaTType: ... def _set_freq(self, freq: BaseOffset | None) -> None: ... @property def year(self) -> int: ... @@ -80,24 +79,30 @@ class Timestamp(datetime): @property def fold(self) -> int: ... @classmethod - def fromtimestamp(cls: Type[_S], t: float, tz: _tzinfo | None = ...) -> _S: ... 
+ def fromtimestamp( + cls: type[_DatetimeT], t: float, tz: _tzinfo | None = ... + ) -> _DatetimeT: ... @classmethod - def utcfromtimestamp(cls: Type[_S], t: float) -> _S: ... + def utcfromtimestamp(cls: type[_DatetimeT], t: float) -> _DatetimeT: ... @classmethod - def today(cls: Type[_S]) -> _S: ... + def today(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ... @classmethod - def fromordinal(cls: Type[_S], n: int) -> _S: ... + def fromordinal( + cls: type[_DatetimeT], + ordinal: int, + freq: str | BaseOffset | None = ..., + tz: _tzinfo | str | None = ..., + ) -> _DatetimeT: ... @classmethod - def now(cls: Type[_S], tz: _tzinfo | str | None = ...) -> _S: ... + def now(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ... @classmethod - def utcnow(cls: Type[_S]) -> _S: ... + def utcnow(cls: type[_DatetimeT]) -> _DatetimeT: ... + # error: Signature of "combine" incompatible with supertype "datetime" @classmethod - def combine( - cls, date: _date, time: _time, tzinfo: _tzinfo | None = ... - ) -> datetime: ... + def combine(cls, date: _date, time: _time) -> datetime: ... # type: ignore[override] @classmethod - def fromisoformat(cls: Type[_S], date_string: str) -> _S: ... - def strftime(self, fmt: str) -> str: ... + def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ... + def strftime(self, format: str) -> str: ... def __format__(self, fmt: str) -> str: ... def toordinal(self) -> int: ... def timetuple(self) -> struct_time: ... @@ -116,10 +121,9 @@ class Timestamp(datetime): second: int = ..., microsecond: int = ..., tzinfo: _tzinfo | None = ..., - *, fold: int = ..., ) -> datetime: ... - def astimezone(self: _S, tz: _tzinfo | None = ...) -> _S: ... + def astimezone(self: _DatetimeT, tz: _tzinfo | None = ...) -> _DatetimeT: ... def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... @classmethod @@ -131,12 +135,18 @@ class Timestamp(datetime): def __lt__(self, other: datetime) -> bool: ... # type: ignore def __ge__(self, other: datetime) -> bool: ... # type: ignore def __gt__(self, other: datetime) -> bool: ... # type: ignore - def __add__(self: _S, other: timedelta) -> _S: ... - def __radd__(self: _S, other: timedelta) -> _S: ... + # error: Signature of "__add__" incompatible with supertype "date"/"datetime" + @overload # type: ignore[override] + def __add__(self, other: np.ndarray) -> np.ndarray: ... + @overload + # TODO: other can also be Tick (but it cannot be resolved) + def __add__(self: _DatetimeT, other: timedelta | np.timedelta64) -> _DatetimeT: ... + def __radd__(self: _DatetimeT, other: timedelta) -> _DatetimeT: ... @overload # type: ignore def __sub__(self, other: datetime) -> timedelta: ... @overload - def __sub__(self, other: timedelta) -> datetime: ... + # TODO: other can also be Tick (but it cannot be resolved) + def __sub__(self, other: timedelta | np.timedelta64) -> datetime: ... def __hash__(self) -> int: ... def weekday(self) -> int: ... def isoweekday(self) -> int: ... @@ -157,23 +167,41 @@ class Timestamp(datetime): def is_year_end(self) -> bool: ... def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... - def to_period(self, freq) -> Period: ... + def to_period(self, freq: BaseOffset | str | None = ...) -> Period: ... def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... - def tz_convert(self: _S, tz) -> _S: ... 
+ def tz_convert(self: _DatetimeT, tz: _tzinfo | str | None) -> _DatetimeT: ... # TODO: could return NaT? def tz_localize( - self: _S, tz, ambiguous: str = ..., nonexistent: str = ... - ) -> _S: ... - def normalize(self: _S) -> _S: ... + self: _DatetimeT, + tz: _tzinfo | str | None, + ambiguous: str = ..., + nonexistent: str = ..., + ) -> _DatetimeT: ... + def normalize(self: _DatetimeT) -> _DatetimeT: ... # TODO: round/floor/ceil could return NaT? def round( - self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... - ) -> _S: ... + self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _DatetimeT: ... def floor( - self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... - ) -> _S: ... + self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _DatetimeT: ... def ceil( - self: _S, freq, ambiguous: bool | str = ..., nonexistent: str = ... - ) -> _S: ... + self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ... + ) -> _DatetimeT: ... + def day_name(self, locale: str | None = ...) -> str: ... + def month_name(self, locale: str | None = ...) -> str: ... + @property + def day_of_week(self) -> int: ... + @property + def day_of_month(self) -> int: ... + @property + def day_of_year(self) -> int: ... + @property + def quarter(self) -> int: ... + @property + def week(self) -> int: ... + def to_numpy( + self, dtype: np.dtype | None = ..., copy: bool = ... + ) -> np.datetime64: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index f491b5aeedadc..1c26793876e5a 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -307,7 +307,6 @@ cdef class _Timestamp(ABCTimestamp): elif not isinstance(self, _Timestamp): # cython semantics, args have been switched and this is __radd__ return other.__add__(self) - return NotImplemented def __sub__(self, other): diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py index 0e506f5e878b4..5256a303de34e 100644 --- a/pandas/_testing/_hypothesis.py +++ b/pandas/_testing/_hypothesis.py @@ -44,6 +44,10 @@ min_size=3, ) +OPTIONAL_ONE_OF_ALL = st.one_of( + OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT +) + if is_platform_windows(): DATETIME_NO_TZ = st.datetimes(min_value=datetime(1900, 1, 1)) else: diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 4d1fb8f33e5ad..e26bb9fb6ebad 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -80,7 +80,8 @@ def _check_comparison_types( f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" ) - if not regex: + if not regex or not should_use_regex(regex, b): + # TODO: should use missing.mask_missing? 
op = lambda x: operator.eq(x, b) else: op = np.vectorize( diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index f69b9868b10e4..17c5320b1e941 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -220,6 +220,16 @@ def _sparse_array_op( left_sp_values = left.sp_values right_sp_values = right.sp_values + if ( + name in ["floordiv", "mod"] + and (right == 0).any() + and left.dtype.kind in ["i", "u"] + ): + # Match the non-Sparse Series behavior + opname = f"sparse_{name}_float64" + left_sp_values = left_sp_values.astype("float64") + right_sp_values = right_sp_values.astype("float64") + sparse_op = getattr(splib, opname) with np.errstate(all="ignore"): diff --git a/pandas/core/base.py b/pandas/core/base.py index 9040414a8f35f..45a9b92d94b62 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -235,7 +235,7 @@ def __getitem__(self, key): raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)): - if len(self.obj.columns.intersection(key)) != len(key): + if len(self.obj.columns.intersection(key)) != len(set(key)): bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 359d89ce664c3..e66086faf53af 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2568,7 +2568,7 @@ def to_json( "primaryKey": [ "index" ], - "pandas_version": "0.20.0" + "pandas_version": "1.4.0" }}, "data": [ {{ diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 986aaa07a913c..48faa1fc46759 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -143,6 +143,7 @@ class OutputKey: "take", "transform", "sample", + "value_counts", ] ) # Valid values of `name` for `groupby.transform(name)` diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4535010b29c3a..9b341845c7170 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,6 +17,7 @@ Iterable, Mapping, NamedTuple, + Sequence, TypeVar, Union, cast, @@ -76,6 +77,7 @@ _transform_template, warn_dropping_nuisance_columns_deprecated, ) +from pandas.core.groupby.grouper import get_grouper from pandas.core.indexes.api import ( Index, MultiIndex, @@ -1569,6 +1571,193 @@ def func(df): boxplot = boxplot_frame_groupby + def value_counts( + self, + subset: Sequence[Hashable] | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> DataFrame | Series: + """ + Return a Series or DataFrame containing counts of unique rows. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + subset : list-like, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don’t include counts of rows that contain NA values. + + Returns + ------- + Series or DataFrame + Series if the groupby as_index is True, otherwise DataFrame. + + See Also + -------- + Series.value_counts: Equivalent method on Series. + DataFrame.value_counts: Equivalent method on DataFrame. + SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. 
+ + Notes + ----- + - If the groupby as_index is True then the returned Series will have a + MultiIndex with one level per input column. + - If the groupby as_index is False then the returned DataFrame will have an + additional column with the value_counts. The column is labelled 'count' or + 'proportion', depending on the ``normalize`` parameter. + + By default, rows that contain any NA values are omitted from + the result. + + By default, the result will be in descending order so that the + first element of each group is the most frequently-occurring row. + + Examples + -------- + >>> df = pd.DataFrame({ + ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... }) + + >>> df + gender education country + 0 male low US + 1 male medium FR + 2 female high US + 3 male low FR + 4 female high FR + 5 male low FR + + >>> df.groupby('gender').value_counts() + gender education country + female high FR 1 + US 1 + male low FR 2 + US 1 + medium FR 1 + dtype: int64 + + >>> df.groupby('gender').value_counts(ascending=True) + gender education country + female high FR 1 + US 1 + male low US 1 + medium FR 1 + low FR 2 + dtype: int64 + + >>> df.groupby('gender').value_counts(normalize=True) + gender education country + female high FR 0.50 + US 0.50 + male low FR 0.50 + US 0.25 + medium FR 0.25 + dtype: float64 + + >>> df.groupby('gender', as_index=False).value_counts() + gender education country count + 0 female high FR 1 + 1 female high US 1 + 2 male low FR 2 + 3 male low US 1 + 4 male medium FR 1 + + >>> df.groupby('gender', as_index=False).value_counts(normalize=True) + gender education country proportion + 0 female high FR 0.50 + 1 female high US 0.50 + 2 male low FR 0.50 + 3 male low US 0.25 + 4 male medium FR 0.25 + """ + if self.axis == 1: + raise NotImplementedError( + "DataFrameGroupBy.value_counts only handles axis=0" + ) + + with self._group_selection_context(): + df = self.obj + + in_axis_names = { + grouping.name for grouping in self.grouper.groupings if grouping.in_axis + } + if isinstance(self._selected_obj, Series): + name = self._selected_obj.name + keys = [] if name in in_axis_names else [self._selected_obj] + else: + keys = [ + # Can't use .values because the column label needs to be preserved + self._selected_obj.iloc[:, idx] + for idx, name in enumerate(self._selected_obj.columns) + if name not in in_axis_names + ] + + if subset is not None: + clashing = set(subset) & set(in_axis_names) + if clashing: + raise ValueError( + f"Keys {clashing} in subset cannot be in " + "the groupby column keys" + ) + + groupings = list(self.grouper.groupings) + for key in keys: + grouper, _, _ = get_grouper( + df, + key=key, + axis=self.axis, + sort=self.sort, + dropna=dropna, + ) + groupings += list(grouper.groupings) + + # Take the size of the overall columns + gb = df.groupby( + groupings, + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ) + result = cast(Series, gb.size()) + + if normalize: + # Normalize the results by dividing by the original group sizes. + # We are guaranteed to have the first N levels be the + # user-requested grouping. 
+ levels = list(range(len(self.grouper.groupings), result.index.nlevels)) + indexed_group_size = result.groupby( + result.index.droplevel(levels), + sort=self.sort, + observed=self.observed, + dropna=self.dropna, + ).transform("sum") + + result /= indexed_group_size + + if sort: + # Sort the values and then resort by the main grouping + index_level = range(len(self.grouper.groupings)) + result = result.sort_values(ascending=ascending).sort_index( + level=index_level, sort_remaining=False + ) + + if not self.as_index: + # Convert to frame + result = result.reset_index(name="proportion" if normalize else "count") + return result.__finalize__(self.obj, method="value_counts") + def _wrap_transform_general_frame( obj: DataFrame, group: DataFrame, res: DataFrame | Series diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e11d420ada29f..a1866e3bdc9f6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2163,22 +2163,35 @@ def size(self) -> DataFrame | Series: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum( - self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + self, + numeric_only: bool | lib.NoDefault = lib.no_default, + min_count: int = 0, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, ): - numeric_only = self._resolve_numeric_only(numeric_only) + if maybe_use_numba(engine): + from pandas.core._numba.kernels import sliding_sum - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _agg_general() returns. GH #31422 - with com.temp_setattr(self, "observed", True): - result = self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="add", - npfunc=np.sum, + return self._numba_agg_general( + sliding_sum, + engine_kwargs, + "groupby_sum", ) + else: + numeric_only = self._resolve_numeric_only(numeric_only) - return self._reindex_output(result, fill_value=0) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index a05f8e581d12f..1e6515084d3b7 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -800,7 +800,7 @@ def get_grouper( # what are we after, exactly? 
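The engine dispatch added to GroupBy.sum above can be exercised as below; this is only a sketch with made-up data and it requires numba to be installed. Results match the default cython engine, with the result-dtype caveat tracked in GH 44952.

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})
    # Routes the reduction through the sliding_sum numba kernel instead of
    # the cython "add" aggregation.
    out = df.groupby("key").sum(engine="numba")
    print(out)  # x: a -> 3.0, b -> 3.0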
any_callable = any(callable(g) or isinstance(g, dict) for g in keys) - any_groupers = any(isinstance(g, Grouper) for g in keys) + any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 872516df4cec8..263a046f59121 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4940,7 +4940,8 @@ def __getitem__(self, key): """ getitem = self._data.__getitem__ - if is_scalar(key): + if is_integer(key) or is_float(key): + # GH#44051 exclude bool, which would return a 2d ndarray key = com.cast_scalar_indexer(key, warn_float=True) return getitem(key) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fdb1ee754a7e6..887c8da6305dd 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1041,7 +1041,11 @@ def _arith_method(self, other, op): rstop = op(left.stop, right) res_name = ops.get_op_result_name(self, other) - result = type(self)(rstart, rstop, rstep, name=res_name) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # The constructor validation can lead to a DeprecationWarning + # from numpy, e.g. with RangeIndex + np.datetime64("now") + result = type(self)(rstart, rstop, rstep, name=res_name) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9ae2cdc96e02b..3bdc378f86b35 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -640,6 +640,8 @@ def replace( to_replace, value, inplace: bool = False, + # mask may be pre-computed if we're called from replace_list + mask: npt.NDArray[np.bool_] | None = None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -665,7 +667,8 @@ def replace( # replace_list instead of replace. return [self] if inplace else [self.copy()] - mask = missing.mask_missing(values, to_replace) + if mask is None: + mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. 
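One user-visible effect of the replace rework in this series is that columns untouched by the replacement keep their dtype; a minimal sketch with illustrative data (see also test_replace_nullable_int_with_string_doesnt_cast further down):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"a": pd.array([1, 2, None], dtype="Int64"), "b": ["x", "", "y"]}
    )
    # "" only occurs in the object column, so the Int64 block is returned
    # unchanged instead of being cast (GH#25438).
    out = df.replace("", np.nan)
    assert out["a"].dtype == df["a"].dtype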
@@ -683,6 +686,7 @@ def replace( to_replace=to_replace, value=value, inplace=True, + mask=mask, ) else: @@ -746,16 +750,6 @@ def replace_list( """ values = self.values - # TODO: dont special-case Categorical - if ( - isinstance(values, Categorical) - and len(algos.unique(dest_list)) == 1 - and not regex - ): - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace) - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -844,25 +838,18 @@ def _replace_coerce( ------- List[Block] """ - if mask.any(): - if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - putmask_inplace(nb.values, mask, value) - return [nb] - else: - regex = should_use_regex(regex, to_replace) - if regex: - return self._replace_regex( - to_replace, - value, - inplace=inplace, - convert=False, - mask=mask, - ) - return self.replace(to_replace, value, inplace=inplace) - return [self] + if should_use_regex(regex, to_replace): + return self._replace_regex( + to_replace, + value, + inplace=inplace, + convert=False, + mask=mask, + ) + else: + return self.replace( + to_replace=to_replace, value=value, inplace=inplace, mask=mask + ) # --------------------------------------------------------------------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 62f542de3437f..21d89f18d4959 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -68,8 +68,6 @@ loads = json.loads dumps = json.dumps -TABLE_SCHEMA_VERSION = "0.20.0" - # interface to/from def to_json( @@ -565,7 +563,7 @@ def read_json( {{"name":"col 1","type":"string"}},\ {{"name":"col 2","type":"string"}}],\ "primaryKey":["index"],\ -"pandas_version":"0.20.0"}},\ +"pandas_version":"1.4.0"}},\ "data":[\ {{"index":"row 1","col 1":"a","col 2":"b"}},\ {{"index":"row 2","col 1":"c","col 2":"d"}}]\ diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index ed33784f44464..cb2d426f6b81b 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,11 +18,13 @@ JSONSerializable, ) +from pandas.core.dtypes.base import _registry as registry from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, @@ -40,6 +42,8 @@ loads = json.loads +TABLE_SCHEMA_VERSION = "1.4.0" + def as_json_table_type(x: DtypeObj) -> str: """ @@ -83,6 +87,8 @@ def as_json_table_type(x: DtypeObj) -> str: return "duration" elif is_categorical_dtype(x): return "any" + elif is_extension_array_dtype(x): + return "any" elif is_string_dtype(x): return "string" else: @@ -130,6 +136,8 @@ def convert_pandas_type_to_json_field(arr): field["freq"] = dtype.freq.freqstr elif is_datetime64tz_dtype(dtype): field["tz"] = dtype.tz.zone + elif is_extension_array_dtype(dtype): + field["extDtype"] = dtype.name return field @@ -195,6 +203,8 @@ def convert_json_field_to_pandas_type(field): return CategoricalDtype( categories=field["constraints"]["enum"], ordered=field["ordered"] ) + elif "extDtype" in field: + return registry.find(field["extDtype"]) else: return "object" @@ -253,7 +263,7 @@ def build_table_schema( {'name': 'B', 'type': 'string'}, \ {'name': 'C', 'type': 'datetime'}], \ 'primaryKey': ['idx'], \ -'pandas_version': '0.20.0'} +'pandas_version': '1.4.0'} """ if index is True: data = 
set_default_names(data) @@ -287,7 +297,7 @@ def build_table_schema( schema["primaryKey"] = primary_key if version: - schema["pandas_version"] = "0.20.0" + schema["pandas_version"] = TABLE_SCHEMA_VERSION return schema diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5d03529654b0d..b769383281880 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -42,7 +42,6 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, - ensure_str, is_bool_dtype, is_categorical_dtype, is_dict_like, @@ -391,22 +390,16 @@ def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) columns = list(zip(*(extract(r) for r in header))) - names = ic + columns - - # If we find unnamed columns all in a single - # level, then our header was too long. - for n in range(len(columns[0])): - if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): - header = ",".join([str(x) for x in self.header]) - raise ParserError( - f"Passed header=[{header}] are too many rows " - "for this multi_index of columns" - ) + names = columns.copy() + for single_ic in sorted(ic): + names.insert(single_ic, single_ic) # Clean the column names (if we have an index_col). if len(ic): col_names = [ - r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None + r[ic[0]] + if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols) + else None for r in header ] else: diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ec20bc49c8a4b..8beacf6828a6b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -188,12 +188,9 @@ class TestPDApi(Base): # private modules in pandas namespace private_modules = [ "_config", - "_hashtable", - "_lib", "_libs", "_is_numpy_dev", "_testing", - "_tslib", "_typing", "_version", ] diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 55cbfaf76d5a7..01b447aa855a3 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -107,15 +107,15 @@ def numeric_idx(request): @pytest.fixture( params=[ - pd.Timedelta("5m4s").to_pytimedelta(), - pd.Timedelta("5m4s"), - pd.Timedelta("5m4s").to_timedelta64(), + pd.Timedelta("10m7s").to_pytimedelta(), + pd.Timedelta("10m7s"), + pd.Timedelta("10m7s").to_timedelta64(), ], ids=lambda x: type(x).__name__, ) def scalar_td(request): """ - Several variants of Timedelta scalars representing 5 minutes and 4 seconds + Several variants of Timedelta scalars representing 10 minutes and 7 seconds. 
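The Table Schema changes above can be seen end to end with a small sketch; build_table_schema lives in a private module, so the import path is an implementation detail rather than public API:

    import pandas as pd
    from pandas.io.json._table_schema import build_table_schema

    df = pd.DataFrame({"a": pd.array([10], dtype="Int64")})
    # Extension dtypes are looked up through the dtype registry and recorded
    # under "extDtype"; the schema version now comes from TABLE_SCHEMA_VERSION.
    schema = build_table_schema(df)
    assert schema["fields"][1] == {"name": "a", "type": "integer", "extDtype": "Int64"}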
""" return request.param diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 6a5d88cc8d4a6..8194f47541e4c 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -814,6 +814,9 @@ def test_dt64arr_add_timedeltalike_scalar( result = rng + two_hours tm.assert_equal(result, expected) + result = two_hours + rng + tm.assert_equal(result, expected) + rng += two_hours tm.assert_equal(rng, expected) @@ -834,34 +837,6 @@ def test_dt64arr_sub_timedeltalike_scalar( rng -= two_hours tm.assert_equal(rng, expected) - # TODO: redundant with test_dt64arr_add_timedeltalike_scalar - def test_dt64arr_add_td64_scalar(self, box_with_array): - # scalar timedeltas/np.timedelta64 objects - # operate with np.timedelta64 correctly - ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) - - expected = Series( - [Timestamp("20130101 9:01:01"), Timestamp("20130101 9:02:01")] - ) - - dtarr = tm.box_expected(ser, box_with_array) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr + np.timedelta64(1, "s") - tm.assert_equal(result, expected) - result = np.timedelta64(1, "s") + dtarr - tm.assert_equal(result, expected) - - expected = Series( - [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] - ) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr + np.timedelta64(5, "ms") - tm.assert_equal(result, expected) - result = np.timedelta64(5, "ms") + dtarr - tm.assert_equal(result, expected) - def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): # GH#23320 special handling for timedelta64("NaT") tz = tz_naive_fixture @@ -918,6 +893,9 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): Timestamp("2013-01-01"), Timestamp("2013-01-01").to_pydatetime(), Timestamp("2013-01-01").to_datetime64(), + # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano + # for DataFrame operation + np.datetime64("2013-01-01", "D"), ], ) def test_dt64arr_sub_dtscalar(self, box_with_array, ts): @@ -931,25 +909,11 @@ def test_dt64arr_sub_dtscalar(self, box_with_array, ts): result = idx - ts tm.assert_equal(result, expected) - def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): - # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano - # for DataFrame operation - dt64 = np.datetime64("2013-01-01") - assert dt64.dtype == "datetime64[D]" - - dti = date_range("20130101", periods=3)._with_freq(None) - dtarr = tm.box_expected(dti, box_with_array) - - expected = TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) - expected = tm.box_expected(expected, box_with_array) - - result = dtarr - dt64 - tm.assert_equal(result, expected) - - result = dt64 - dtarr + result = ts - idx + tm.assert_equal(result, -expected) tm.assert_equal(result, -expected) - def test_dt64arr_sub_timestamp(self, box_with_array): + def test_dt64arr_sub_timestamp_tzaware(self, box_with_array): ser = date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern") ser = ser._with_freq(None) ts = ser[0] @@ -1024,25 +988,73 @@ def test_dt64arr_aware_sub_dt64ndarray_raises( # ------------------------------------------------------------- # Addition of datetime-like others (invalid) - def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): - + def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array): + # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 + # GH#9631 tz = tz_naive_fixture - 
dti = date_range("2016-01-01", periods=3, tz=tz) - dt64vals = dti.values + dti = date_range("2016-01-01", periods=3, tz=tz) + if tz is None: + dti2 = dti.tz_localize("US/Eastern") + else: + dti2 = dti.tz_localize(None) dtarr = tm.box_expected(dti, box_with_array) - assert_cannot_add(dtarr, dt64vals) - def test_dt64arr_add_timestamp_raises(self, box_with_array): - # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 - idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) - ts = idx[0] - idx = tm.box_expected(idx, box_with_array) - assert_cannot_add(idx, ts) + assert_cannot_add(dtarr, dti.values) + assert_cannot_add(dtarr, dti) + assert_cannot_add(dtarr, dtarr) + assert_cannot_add(dtarr, dti[0]) + assert_cannot_add(dtarr, dti[0].to_pydatetime()) + assert_cannot_add(dtarr, dti[0].to_datetime64()) + assert_cannot_add(dtarr, dti2[0]) + assert_cannot_add(dtarr, dti2[0].to_pydatetime()) + assert_cannot_add(dtarr, np.datetime64("2011-01-01", "D")) # ------------------------------------------------------------- # Other Invalid Addition/Subtraction + # Note: freq here includes both Tick and non-Tick offsets; this is + # relevant because historically integer-addition was allowed if we had + # a freq. + @pytest.mark.parametrize("freq", ["H", "D", "W", "M", "MS", "Q", "B", None]) + @pytest.mark.parametrize("dtype", [None, "uint8"]) + def test_dt64arr_addsub_intlike( + self, dtype, box_with_array, freq, tz_naive_fixture + ): + # GH#19959, GH#19123, GH#19012 + tz = tz_naive_fixture + if box_with_array is pd.DataFrame: + # alignment headaches + return + + if freq is None: + dti = DatetimeIndex(["NaT", "2017-04-05 06:07:08"], tz=tz) + else: + dti = date_range("2016-01-01", periods=2, freq=freq, tz=tz) + + obj = box_with_array(dti) + other = np.array([4, -1], dtype=dtype) + + msg = "|".join( + [ + "Addition/subtraction of integers", + "cannot subtract DatetimeArray from", + # IntegerArray + "can only perform ops with numeric values", + "unsupported operand type.*Categorical", + ] + ) + assert_invalid_addsub_type(obj, 1, msg) + assert_invalid_addsub_type(obj, np.int64(2), msg) + assert_invalid_addsub_type(obj, np.array(3, dtype=np.int64), msg) + assert_invalid_addsub_type(obj, other, msg) + assert_invalid_addsub_type(obj, np.array(other), msg) + assert_invalid_addsub_type(obj, pd.array(other), msg) + assert_invalid_addsub_type(obj, pd.Categorical(other), msg) + assert_invalid_addsub_type(obj, pd.Index(other), msg) + assert_invalid_addsub_type(obj, pd.core.indexes.api.NumericIndex(other), msg) + assert_invalid_addsub_type(obj, Series(other), msg) + @pytest.mark.parametrize( "other", [ @@ -1101,48 +1113,49 @@ def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixtu obj1 = tm.box_expected(obj1, box_with_array) obj2 = tm.box_expected(obj2, box_with_array) + msg = "|".join( + [ + "unsupported operand", + "cannot subtract DatetimeArray from ndarray", + ] + ) + with warnings.catch_warnings(record=True): # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being # applied to Series or DatetimeIndex # we aren't testing that here, so ignore. 
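The consolidated parametrization above (adding a non-nanosecond np.datetime64 scalar) reflects behaviour that can be sketched directly; illustrative data only:

    import numpy as np
    import pandas as pd

    dti = pd.date_range("2013-01-01", periods=3)
    # The day-resolution scalar is converted to nanosecond precision first, so
    # the result matches subtracting Timestamp("2013-01-01") (GH#7996, GH#22163).
    res = dti - np.datetime64("2013-01-01", "D")
    print(res)  # TimedeltaIndex(['0 days', '1 days', '2 days'], ...)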
warnings.simplefilter("ignore", PerformanceWarning) - # If `x + y` raises, then `y + x` should raise here as well + assert_invalid_addsub_type(obj1, obj2, msg=msg) - msg = ( - r"unsupported operand type\(s\) for -: " - "'(Timestamp|DatetimeArray)' and 'datetime.time'" - ) - with pytest.raises(TypeError, match=msg): - obj1 - obj2 + # ------------------------------------------------------------- + # Other invalid operations - msg = "|".join( - [ - "cannot subtract DatetimeArray from ndarray", - "ufunc (subtract|'subtract') cannot use operands with types " - r"dtype\('O'\) and dtype\(' str: + return self.name + + +class DateArray(ExtensionArray): + def __init__( + self, + dates: Union[ + dt.date, + Sequence[dt.date], + Tuple[np.ndarray, np.ndarray, np.ndarray], + np.ndarray, + ], + ) -> None: + if isinstance(dates, dt.date): + self._year = np.array([dates.year]) + self._month = np.array([dates.month]) + self._day = np.array([dates.year]) + return + + ldates = len(dates) + if isinstance(dates, list): + # pre-allocate the arrays since we know the size before hand + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + # populate them + for i, (y, m, d) in enumerate( + map(lambda date: (date.year, date.month, date.day), dates) + ): + self._year[i] = y + self._month[i] = m + self._day[i] = d + + elif isinstance(dates, tuple): + # only support triples + if ldates != 3: + raise ValueError("only triples are valid") + # check if all elements have the same type + if any(map(lambda x: not isinstance(x, np.ndarray), dates)): + raise TypeError("invalid type") + ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates) + if not ly == lm == ld: + raise ValueError( + f"tuple members must have the same length: {(ly, lm, ld)}" + ) + self._year = dates[0].astype(np.uint16) + self._month = dates[1].astype(np.uint8) + self._day = dates[2].astype(np.uint8) + + elif isinstance(dates, np.ndarray) and dates.dtype == "U10": + self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999) + self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) + self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) + + for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): + self._year[i] = int(y) + self._month[i] = int(m) + self._day[i] = int(d) + + else: + raise TypeError(f"{type(dates)} is not supported") + + @property + def dtype(self) -> ExtensionDtype: + return DateDtype() + + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if isinstance(dtype, DateDtype): + data = self.copy() if copy else self + else: + data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min) + + return data + + @property + def nbytes(self) -> int: + return self._year.nbytes + self._month.nbytes + self._day.nbytes + + def __len__(self) -> int: + return len(self._year) # all 3 arrays are enforced to have the same length + + def __getitem__(self, item: PositionalIndexer): + if isinstance(item, int): + return dt.date(self._year[item], self._month[item], self._day[item]) + else: + raise NotImplementedError("only ints are supported as indexes") + + def __setitem__(self, key: Union[int, slice, np.ndarray], value: Any): + if not isinstance(key, int): + raise NotImplementedError("only ints are supported as indexes") + + if not isinstance(value, dt.date): + raise TypeError("you can only set datetime.date types") + + self._year[key] = value.year + self._month[key] = value.month + 
self._day[key] = value.day + + def __repr__(self) -> str: + return f"DateArray{list(zip(self._year, self._month, self._day))}" + + def copy(self) -> "DateArray": + return DateArray((self._year.copy(), self._month.copy(), self._day.copy())) + + def isna(self) -> np.ndarray: + return np.logical_and( + np.logical_and( + self._year == dt.date.min.year, self._month == dt.date.min.month + ), + self._day == dt.date.min.day, + ) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False): + if isinstance(scalars, dt.date): + pass + elif isinstance(scalars, DateArray): + pass + elif isinstance(scalars, np.ndarray): + scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd + return DateArray(scalars) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 062ab9bc2b4d7..e58e26fafdc1b 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -67,8 +67,11 @@ class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray): def __init__(self, values, dtype=None, copy=False, context=None): for i, val in enumerate(values): - if is_float(val) and np.isnan(val): - values[i] = DecimalDtype.na_value + if is_float(val): + if np.isnan(val): + values[i] = DecimalDtype.na_value + else: + values[i] = DecimalDtype.type(val) elif not isinstance(val, decimal.Decimal): raise TypeError("All values must be of type " + str(decimal.Decimal)) values = np.asarray(values, dtype=object) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index a079a3a7921d7..e668e77644082 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -2,7 +2,7 @@ from hypothesis import ( given, - strategies as st, + settings, ) import numpy as np import pytest @@ -22,13 +22,7 @@ isna, ) import pandas._testing as tm -from pandas._testing._hypothesis import ( - OPTIONAL_DICTS, - OPTIONAL_FLOATS, - OPTIONAL_INTS, - OPTIONAL_LISTS, - OPTIONAL_TEXT, -) +from pandas._testing._hypothesis import OPTIONAL_ONE_OF_ALL @pytest.fixture(params=["default", "float_string", "mixed_float", "mixed_int"]) @@ -874,11 +868,8 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) -@given( - data=st.one_of( - OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT - ) -) +@given(data=OPTIONAL_ONE_OF_ALL) +@settings(deadline=None) # GH 44969 def test_where_inplace_casting(data): # GH 22051 df = DataFrame({"a": data}) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 65bd2207c00a7..a8148b6746aca 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,6 +624,14 @@ def test_replace_mixed3(self): expected.iloc[1, 1] = m[1] tm.assert_frame_equal(result, expected) + def test_replace_nullable_int_with_string_doesnt_cast(self): + # GH#25438 don't cast df['a'] to float64 + df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df["a"] = df["a"].astype("Int64") + + res = df.replace("", np.nan) + tm.assert_series_equal(res["a"], df["a"]) + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) def test_replace_with_nullable_column(self, dtype): # GH-44499 @@ -1382,15 +1390,12 @@ def test_replace_value_category_type(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - reason="category dtype gets changed to object type after 
replace, see #35268", - raises=AssertionError, - ) def test_replace_dict_category_type(self): """ Test to ensure category dtypes are maintained after replace with dict values """ + # GH#35268, GH#44940 # create input dataframe input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 596e9e0a4de77..2be680d7a4ccd 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -183,6 +183,7 @@ def nopython(request): ("var", {"ddof": 0}), ("std", {"ddof": 1}), ("std", {"ddof": 0}), + ("sum", {}), ] ) def numba_supported_reductions(request): diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 9d9a2e39e06c7..44778aafdf75f 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -319,6 +319,7 @@ def test_tab_completion(mframe): "pipe", "sample", "ewm", + "value_counts", } assert results == expected diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py new file mode 100644 index 0000000000000..79ef46db8e95e --- /dev/null +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -0,0 +1,444 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, +) +import pandas._testing as tm + + +@pytest.fixture +def education_df(): + return DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + + +def test_axis(education_df): + gp = education_df.groupby("country", axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() + + +def test_bad_subset(education_df): + gp = education_df.groupby("country") + with pytest.raises(ValueError, match="subset"): + gp.value_counts(subset=["country"]) + + +def test_basic(education_df): + # gh43564 + result = education_df.groupby("country")[["gender", "education"]].value_counts( + normalize=True + ) + expected = Series( + data=[0.5, 0.25, 0.25, 0.5, 0.5], + index=MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + names=["country", "gender", "education"], + ), + ) + tm.assert_series_equal(result, expected) + + +def _frame_value_counts(df, keys, normalize, sort, ascending): + return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) + + +@pytest.mark.parametrize("groupby", ["column", "array", "function"]) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending", + [ + (False, None), + (True, True), + (True, False), + ], +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_against_frame_and_seriesgroupby( + education_df, groupby, normalize, sort, ascending, as_index, frame +): + # test all parameters: + # - Use column, array or function as by= parameter + # - Whether or not to normalize + # - Whether or not to sort and how + # - Whether or not to use the groupby as an index + # - 3-way compare against: + # - apply with :meth:`~DataFrame.value_counts` + # - `~SeriesGroupBy.value_counts` + by = { + "column": "country", + "array": education_df["country"].values, + "function": lambda x: education_df["country"][x] == "US", + }[groupby] + 
+ gp = education_df.groupby(by=by, as_index=as_index) + result = gp[["gender", "education"]].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + if frame: + # compare against apply with DataFrame value_counts + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) + + if as_index: + tm.assert_series_equal(result, expected) + else: + name = "proportion" if normalize else "count" + expected = expected.reset_index().rename({0: name}, axis=1) + if groupby == "column": + expected = expected.rename({"level_0": "country"}, axis=1) + expected["country"] = np.where(expected["country"], "US", "FR") + elif groupby == "function": + expected["level_0"] = expected["level_0"] == 1 + else: + expected["level_0"] = np.where(expected["level_0"], "US", "FR") + tm.assert_frame_equal(result, expected) + else: + # compare against SeriesGroupBy value_counts + education_df["both"] = education_df["gender"] + "-" + education_df["education"] + expected = gp["both"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected.name = None + if as_index: + index_frame = expected.index.to_frame(index=False) + index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) + index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + del index_frame["both"] + index_frame = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame) + tm.assert_series_equal(result, expected) + else: + expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) + expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + del expected["both"] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending, expected_rows, expected_count, expected_group_size", + [ + (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), + (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), + (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), + ], +) +def test_compound( + education_df, + normalize, + sort, + ascending, + expected_rows, + expected_count, + expected_group_size, +): + # Multiple groupby keys and as_index=False + gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) + result = gp["education"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected = DataFrame() + for column in ["country", "gender", "education"]: + expected[column] = [education_df[column][row] for row in expected_rows] + if normalize: + expected["proportion"] = expected_count + expected["proportion"] /= expected_group_size + else: + expected["count"] = expected_count + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def animals_df(): + return DataFrame( + {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + +@pytest.mark.parametrize( + "sort, ascending, normalize, expected_data, expected_index", + [ + (False, None, False, [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + (True, False, True, [0.5, 0.25, 0.25], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + ], +) +def test_data_frame_value_counts( + animals_df, sort, ascending, normalize, expected_data, expected_index +): + # 3-way compare with :meth:`~DataFrame.value_counts` + # 
Tests from frame/methods/test_value_counts.py + result_frame = animals_df.value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + expected = Series( + data=expected_data, + index=MultiIndex.from_arrays( + expected_index, names=["key", "num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = animals_df.groupby("key").value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.fixture +def nulls_df(): + n = np.nan + return DataFrame( + { + "A": [1, 1, n, 4, n, 6, 6, 6, 6], + "B": [1, 1, 3, n, n, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, n, 8, n], + "D": [1, 2, 3, 4, 5, 6, 7, n, n], + } + ) + + +@pytest.mark.parametrize( + "group_dropna, count_dropna, expected_rows, expected_values", + [ + ( + False, + False, + [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], + ), + (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), + (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), + ], +) +def test_dropna_combinations( + nulls_df, group_dropna, count_dropna, expected_rows, expected_values +): + gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) + result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) + columns = DataFrame() + for column in nulls_df.columns: + columns[column] = [nulls_df[column][row] for row in expected_rows] + index = MultiIndex.from_frame(columns) + expected = Series(data=expected_values, index=index) + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def names_with_nulls_df(nulls_fixture): + return DataFrame( + { + "key": [1, 1, 1, 1], + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + +@pytest.mark.parametrize( + "dropna, expected_data, expected_index", + [ + ( + True, + [1, 1], + MultiIndex.from_arrays( + [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + names=["key", "first_name", "middle_name"], + ), + ), + ( + False, + [1, 1, 1, 1], + MultiIndex( + levels=[ + Index([1]), + Index(["Anne", "Beth", "John"]), + Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + names=["key", "first_name", "middle_name"], + ), + ), + ], +) +@pytest.mark.parametrize("normalize", [False, True]) +def test_data_frame_value_counts_dropna( + names_with_nulls_df, dropna, normalize, expected_data, expected_index +): + # GH 41334 + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests with nulls from frame/methods/test_value_counts.py + result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) + expected = Series( + data=expected_data, + index=expected_index, + ) + if normalize: + expected /= float(len(expected_data)) + + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( + dropna=dropna, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize( + "observed, expected_index", + [ + ( + False, + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", 
"high"), + ("US", "male", "medium"), + ], + ), + ( + True, + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + ), + ], +) +@pytest.mark.parametrize( + "normalize, expected_data", + [ + (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)), + ( + True, + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical( + education_df, as_index, observed, expected_index, normalize, expected_data +): + # Test categorical data whether or not observed + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data[expected_data > 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label, expected_values", + [ + (False, "count", [1, 1, 1]), + (True, "proportion", [0.5, 0.5, 1.0]), + ], +) +def test_mixed_groupings(normalize, expected_label, expected_values): + # Test multiple groupings + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) + result = gp.value_counts(sort=True, normalize=normalize) + expected = DataFrame( + { + "level_0": [4, 4, 5], + "A": [1, 1, 2], + "level_2": [8, 8, 7], + "B": [1, 3, 2], + expected_label: expected_values, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, expected_names", + [ + ("repeat", ["a", None, "d", "b", "b", "e"]), + ("level", ["a", None, "d", "b", "c", "level_1"]), + ], +) +@pytest.mark.parametrize("as_index", [False, True]) +def test_column_name_clashes(test, expected_names, as_index): + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8], "e": [9, 10]}) + if test == "repeat": + df.columns = list("abbde") + else: + df.columns = list("abcd") + ["level_1"] + + if as_index: + result = df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + expected = Series( + data=(1, 1), + index=MultiIndex.from_tuples( + [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)], + names=expected_names, + ), + ) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(ValueError, match="cannot insert"): + df.groupby(["a", [0, 1], "d"], as_index=as_index).value_counts() + + +def test_ambiguous_grouping(): + # Test that groupby is not confused by groupings length equal to row count + df = DataFrame({"a": [1, 1]}) + gb = df.groupby([1, 1]) + result = gb.value_counts() + expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index b9f71fd4ed96a..aea659445801b 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -2,6 +2,7 @@ import random +import numpy as np import pytest import pandas as pd @@ -285,3 +286,20 @@ def test_column_axis(column_group_df): expected 
= column_group_df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array]) +def test_groupby_duplicated_columns(func): + # GH#44924 + df = pd.DataFrame( + { + "A": [1, 2], + "B": [3, 3], + "C": ["G", "G"], + } + ) + result = df.groupby("C")[func(["A", "B", "A"])].mean() + expected = pd.DataFrame( + [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 6fc1f0d808bb2..6554993c140a1 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -24,7 +24,9 @@ def test_cython_vs_numba_frame( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_frame_equal(result, expected) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def test_cython_vs_numba_getitem( self, sort, nogil, parallel, nopython, numba_supported_reductions @@ -37,7 +39,9 @@ def test_cython_vs_numba_getitem( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_series_equal(result, expected) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) def test_cython_vs_numba_series( self, sort, nogil, parallel, nopython, numba_supported_reductions @@ -50,7 +54,9 @@ def test_cython_vs_numba_series( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_series_equal(result, expected) + # check_dtype can be removed if GH 44952 is addressed + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) def test_as_index_false_unsupported(self, numba_supported_reductions): func, kwargs = numba_supported_reductions diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 9dee9fe288ed8..c60c74479f8b6 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -670,6 +670,21 @@ def test_getitem_2d_deprecated(self, simple_index): assert isinstance(res, np.ndarray), type(res) + if not isinstance(idx, RangeIndex): + # GH#44051 RangeIndex already raises + with tm.assert_produces_warning(FutureWarning, match=msg): + res = idx[True] + assert isinstance(res, np.ndarray), type(res) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = idx[False] + assert isinstance(res, np.ndarray), type(res) + else: + msg = "only integers, slices" + with pytest.raises(IndexError, match=msg): + idx[True] + with pytest.raises(IndexError, match=msg): + idx[False] + def test_copy_shares_cache(self, simple_index): # GH32898, GH36840 idx = simple_index diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 5418f3a5964d9..c44303aa2c862 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -63,3 +63,9 @@ def test_getitem_2d_deprecated(self, simple_index): with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): with tm.assert_produces_warning(FutureWarning): idx[:, None] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[True] + with pytest.raises(ValueError, match="multi-dimensional 
indexing not allowed"): + # GH#44051 + idx[False] diff --git a/pandas/tests/indexes/period/methods/test_to_timestamp.py b/pandas/tests/indexes/period/methods/test_to_timestamp.py index c2328872aee1b..164ed3ec43996 100644 --- a/pandas/tests/indexes/period/methods/test_to_timestamp.py +++ b/pandas/tests/indexes/period/methods/test_to_timestamp.py @@ -16,6 +16,36 @@ class TestToTimestamp: + def test_to_timestamp_non_contiguous(self): + # GH#44100 + dti = date_range("2021-10-18", periods=9, freq="B") + pi = dti.to_period() + + result = pi[::2].to_timestamp() + expected = dti[::2] + tm.assert_index_equal(result, expected) + + result = pi._data[::2].to_timestamp() + expected = dti._data[::2] + # TODO: can we get the freq to round-trip? + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::-1].to_timestamp() + expected = dti[::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::-1].to_timestamp() + expected = dti._data[::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + + result = pi[::2][::-1].to_timestamp() + expected = dti[::2][::-1] + tm.assert_index_equal(result, expected) + + result = pi._data[::2][::-1].to_timestamp() + expected = dti._data[::2][::-1] + tm.assert_datetime_array_equal(result, expected, check_freq=False) + def test_to_timestamp_freq(self): idx = period_range("2017", periods=12, freq="A-DEC") result = idx.to_timestamp() diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2366dd39c25f2..014f0f5933387 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1177,6 +1177,7 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): assert obj.dtype == from_key result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key @@ -1197,7 +1198,21 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) + warn = None + rep_ser = pd.Series(replacer) + if ( + isinstance(obj.dtype, pd.DatetimeTZDtype) + and isinstance(rep_ser.dtype, pd.DatetimeTZDtype) + and obj.dtype != rep_ser.dtype + ): + # mismatched tz DatetimeArray behavior will change to cast + # for setitem-like methods with mismatched tzs GH#44940 + warn = FutureWarning + + msg = "explicitly cast to object" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 199c4e64f18fd..ef313b2840107 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -494,7 +494,6 @@ def test_copy(self, mgr): def test_sparse(self): mgr = create_mgr("a: sparse-1; b: sparse-2") - # what to test here? assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): @@ -502,8 +501,6 @@ def test_sparse_mixed(self): assert len(mgr.blocks) == 3 assert isinstance(mgr, BlockManager) - # TODO: what to test here? 
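The new to_timestamp tests above cover round-tripping of non-contiguous and reversed PeriodIndex slices; a minimal sketch of the user-facing behaviour:

    import pandas as pd

    dti = pd.date_range("2021-10-18", periods=9, freq="B")
    pi = dti.to_period()
    # Stepped and reversed slices now convert back to the matching timestamps
    # (GH#44100); on the array path the freq does not round-trip.
    assert pi[::2].to_timestamp().equals(dti[::2])
    assert pi[::-1].to_timestamp().equals(dti[::-1])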
- @pytest.mark.parametrize( "mgr_string, dtype", [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)], diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a61e77bec9828..aa8508d8e8942 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -452,43 +452,59 @@ def test_to_html_invalid_justify(justify): df.to_html(justify=justify) -def test_to_html_index(datapath): - # TODO: split this test - index = ["foo", "bar", "baz"] - df = DataFrame( - {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, - columns=["A", "B", "C"], - index=index, - ) - expected_with_index = expected_html(datapath, "index_1") - assert df.to_html() == expected_with_index - - expected_without_index = expected_html(datapath, "index_2") - result = df.to_html(index=False) - for i in index: - assert i not in result - assert result == expected_without_index - df.index = Index(["foo", "bar", "baz"], name="idx") - expected_with_index = expected_html(datapath, "index_3") - assert df.to_html() == expected_with_index - assert df.to_html(index=False) == expected_without_index - - tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] - df.index = MultiIndex.from_tuples(tuples) - - expected_with_index = expected_html(datapath, "index_4") - assert df.to_html() == expected_with_index +class TestHTMLIndex: + @pytest.fixture + def df(self): + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + return df - result = df.to_html(index=False) - for i in ["foo", "bar", "car", "bike"]: - assert i not in result - # must be the same result as normal index - assert result == expected_without_index - - df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) - expected_with_index = expected_html(datapath, "index_5") - assert df.to_html() == expected_with_index - assert df.to_html(index=False) == expected_without_index + @pytest.fixture + def expected_without_index(self, datapath): + return expected_html(datapath, "index_2") + + def test_to_html_flat_index_without_name( + self, datapath, df, expected_without_index + ): + expected_with_index = expected_html(datapath, "index_1") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in df.index: + assert i not in result + assert result == expected_without_index + + def test_to_html_flat_index_with_name(self, datapath, df, expected_without_index): + df.index = Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") + assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index + + def test_to_html_multiindex_without_names( + self, datapath, df, expected_without_index + ): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples) + + expected_with_index = expected_html(datapath, "index_4") + assert df.to_html() == expected_with_index + + result = df.to_html(index=False) + for i in ["foo", "bar", "car", "bike"]: + assert i not in result + # must be the same result as normal index + assert result == expected_without_index + + def test_to_html_multiindex_with_names(self, datapath, df, expected_without_index): + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") + 
assert df.to_html() == expected_with_index + assert df.to_html(index=False) == expected_without_index @pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 89b8783462f7e..b204d3bb97b6e 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -185,14 +185,11 @@ def test_as_json_table_type_date_dtypes(self, date_dtype): def test_as_json_table_type_timedelta_dtypes(self, td_dtype): assert as_json_table_type(td_dtype) == "duration" - @pytest.mark.parametrize("str_dtype", [object]) # TODO + @pytest.mark.parametrize("str_dtype", [object]) # TODO(GH#14904) flesh out dtypes? def test_as_json_table_type_string_dtypes(self, str_dtype): assert as_json_table_type(str_dtype) == "string" def test_as_json_table_type_categorical_dtypes(self): - # TODO: I think before is_categorical_dtype(Categorical) - # returned True, but now it's False. Figure out why or - # if it matters assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any" assert as_json_table_type(CategoricalDtype()) == "any" diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py new file mode 100644 index 0000000000000..3daac204aa730 --- /dev/null +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -0,0 +1,265 @@ +"""Tests for ExtensionDtype Table Schema integration.""" + +from collections import OrderedDict +import datetime as dt +import decimal +import json + +import pytest + +from pandas import ( + DataFrame, + array, +) +from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.string_ import StringDtype +from pandas.core.series import Series +from pandas.tests.extension.date import ( + DateArray, + DateDtype, +) +from pandas.tests.extension.decimal.array import ( + DecimalArray, + DecimalDtype, +) + +from pandas.io.json._table_schema import ( + as_json_table_type, + build_table_schema, +) + + +class TestBuildSchema: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + "C": self.sa, + "D": self.ia, + } + ) + + def test_build_table_schema(self): + result = build_table_schema(self.df, version=False) + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "A", "type": "any", "extDtype": "DateDtype"}, + {"name": "B", "type": "any", "extDtype": "decimal"}, + {"name": "C", "type": "any", "extDtype": "string"}, + {"name": "D", "type": "integer", "extDtype": "Int64"}, + ], + "primaryKey": ["index"], + } + assert result == expected + result = build_table_schema(self.df) + assert "pandas_version" in result + + +class TestTableSchemaType: + @pytest.mark.parametrize( + "date_data", + [ + DateArray([dt.date(2021, 10, 10)]), + DateArray(dt.date(2021, 10, 10)), + Series(DateArray(dt.date(2021, 10, 10))), + ], + ) + def test_as_json_table_type_ext_date_array_dtype(self, date_data): + assert as_json_table_type(date_data.dtype) == "any" + + def test_as_json_table_type_ext_date_dtype(self): + assert as_json_table_type(DateDtype()) == "any" + + @pytest.mark.parametrize( + "decimal_data", + [ + DecimalArray([decimal.Decimal(10)]), + Series(DecimalArray([decimal.Decimal(10)])), + ], + ) + def 
test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): + assert as_json_table_type(decimal_data.dtype) == "any" + + def test_as_json_table_type_ext_decimal_dtype(self): + assert as_json_table_type(DecimalDtype()) == "any" + + @pytest.mark.parametrize( + "string_data", + [ + array(["pandas"], dtype="string"), + Series(array(["pandas"], dtype="string")), + ], + ) + def test_as_json_table_type_ext_string_array_dtype(self, string_data): + assert as_json_table_type(string_data.dtype) == "any" + + def test_as_json_table_type_ext_string_dtype(self): + assert as_json_table_type(StringDtype()) == "any" + + @pytest.mark.parametrize( + "integer_data", + [ + array([10], dtype="Int64"), + Series(array([10], dtype="Int64")), + ], + ) + def test_as_json_table_type_ext_integer_array_dtype(self, integer_data): + assert as_json_table_type(integer_data.dtype) == "integer" + + def test_as_json_table_type_ext_integer_dtype(self): + assert as_json_table_type(Int64Dtype()) == "integer" + + +class TestTableOrient: + def setup_method(self, method): + self.da = DateArray([dt.date(2021, 10, 10)]) + self.dc = DecimalArray([decimal.Decimal(10)]) + self.sa = array(["pandas"], dtype="string") + self.ia = array([10], dtype="Int64") + self.df = DataFrame( + { + "A": self.da, + "B": self.dc, + "C": self.sa, + "D": self.ia, + } + ) + + def test_build_date_series(self): + s = Series(self.da, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "DateDtype"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000Z")])]), + ] + ) + + assert result == expected + + def test_build_decimal_series(self): + s = Series(self.dc, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "decimal"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10.0)])]), + ] + ) + + assert result == expected + + def test_build_string_series(self): + s = Series(self.sa, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": "a", "type": "any", "extDtype": "string"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", "pandas")])]), + ] + ) + + assert result == expected + + def test_build_int64_series(self): + s = Series(self.ia, name="a") + s.index.name = "id" + result = s.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + {"name": "id", "type": "integer"}, + {"name": 
"a", "type": "integer", "extDtype": "Int64"}, + ] + + schema = {"fields": fields, "primaryKey": ["id"]} + + expected = OrderedDict( + [ + ("schema", schema), + ("data", [OrderedDict([("id", 0), ("a", 10)])]), + ] + ) + + assert result == expected + + def test_to_json(self): + df = self.df.copy() + df.index.name = "idx" + result = df.to_json(orient="table", date_format="iso") + result = json.loads(result, object_pairs_hook=OrderedDict) + + assert "pandas_version" in result["schema"] + result["schema"].pop("pandas_version") + + fields = [ + OrderedDict({"name": "idx", "type": "integer"}), + OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}), + OrderedDict({"name": "B", "type": "any", "extDtype": "decimal"}), + OrderedDict({"name": "C", "type": "any", "extDtype": "string"}), + OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}), + ] + + schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]}) + data = [ + OrderedDict( + [ + ("idx", 0), + ("A", "2021-10-10T00:00:00.000Z"), + ("B", 10.0), + ("C", "pandas"), + ("D", 10), + ] + ) + ] + expected = OrderedDict([("schema", schema), ("data", data)]) + + assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10eb3526c80ff..1cfda5c096fba 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1673,7 +1673,7 @@ def test_to_json_indent(self, indent): "primaryKey":[ "index" ], - "pandas_version":"0.20.0" + "pandas_version":"1.4.0" }, "data":[ { diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index 0855a469ae58d..4a8f734a34abf 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -28,7 +28,6 @@ @tm.network def test_url(all_parsers, csv_dir_path): - # TODO: FTP testing parser = all_parsers kwargs = {"sep": "\t"} diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index b0742f5b41a92..3fc23525df89e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -557,26 +557,21 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): else: data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) + exp_columns = [] + if columns is None: - msg = ( - r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, index_col=index_col) - else: - result = parser.read_csv(StringIO(data), header=header, index_col=index_col) - exp_columns = [] + columns = ["", "", ""] - for i, col in enumerate(columns): - if not col: # Unnamed. - col = f"Unnamed: {i if index_col is None else i + 1}_level_0" + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = f"Unnamed: {i if index_col is None else i + 1}_level_0" - exp_columns.append(col) + exp_columns.append(col) - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 58b5eebbec344..f30aba3db917e 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -332,3 +332,23 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val): result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) expected = DataFrame({"b": [2]}, index=Index([val], name="a")) tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_multiindex_columns_not_leading_index_col(all_parsers): + # GH#38549 + parser = all_parsers + data = """a,b,c,d +e,f,g,h +x,y,1,2 +""" + result = parser.read_csv( + StringIO(data), + header=[0, 1], + index_col=1, + ) + cols = MultiIndex.from_tuples( + [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"] + ) + expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d0080273537bb..f35caf38c847f 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -174,6 +174,28 @@ def test_usecols_implicit_index_col(all_parsers): tm.assert_frame_equal(result, expected) +def test_usecols_index_col_middle(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="c") + expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c")) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_end(all_parsers): + # GH#9098 + parser = all_parsers + data = """a,b,c,d +1,2,3,4 +""" + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], index_col="d") + expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d")) + tm.assert_frame_equal(result, expected) + + def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 83c86d4da05e6..cbca8bb64e350 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -972,7 +972,8 @@ def test_columns_multiindex_modified(setup_path): ) cols2load = list("BCD") cols2load_original = list(cols2load) - df_loaded = read_hdf(path, "df", columns=cols2load) # noqa + # GH#10055 make sure read_hdf call does not alter cols2load inplace + read_hdf(path, "df", columns=cols2load) assert cols2load_original == cols2load diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index d4b78d8371ede..3aac7e95e6591 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -14,7 +14,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.errors import ParserError import pandas.util._test_decorators as td from pandas import ( @@ -918,13 +917,8 @@ def test_wikipedia_states_multiindex(self, datapath): assert 
np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) def test_parser_error_on_empty_header_row(self): - msg = ( - r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns" - ) - with pytest.raises(ParserError, match=msg): - self.read_html( - """ + result = self.read_html( + """ @@ -935,8 +929,15 @@ def test_parser_error_on_empty_header_row(self):
""", - header=[0, 1], - ) + header=[0, 1], + ) + expected = DataFrame( + [["a", "b"]], + columns=MultiIndex.from_tuples( + [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")] + ), + ) + tm.assert_frame_equal(result[0], expected) def test_decimal_rows(self): # GH 12907 diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 92a53a443b217..cb8ee4891a41e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2128,7 +2128,7 @@ def test_get_engine_auto_error_message(self): # Expect different error messages from get_engine(engine="auto") # if engines aren't installed vs. are installed but bad version pass - # TODO fill this in when we add more engines + # TODO(GH#36893) fill this in when we add more engines class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 0d1bb05c27564..2f1ae5df0d5d4 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -43,15 +43,14 @@ def test_dt64_mean(self, tz_naive_fixture, box): assert obj.mean(skipna=False) is pd.NaT @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - def test_period_mean(self, box): + @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"]) + def test_period_mean(self, box, freq): # GH#24757 dti = pd.date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) - # use hourly frequency to avoid rounding errors in expected results - # TODO: flesh this out with different frequencies - parr = dti._data.to_period("H") + parr = dti._data.to_period(freq) obj = box(parr) with pytest.raises(TypeError, match="ambiguous"): obj.mean() diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index b8b254e786194..d5d86465dd91b 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -10,48 +10,50 @@ ) import pandas._testing as tm +dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), +] +tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), +] +td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), +] +period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), +] +data_dict = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, +} + class TestConcatAppendCommon: """ Test common dtype coercion rules between concat and append. 
""" - def setup_method(self, method): - - dt_data = [ - pd.Timestamp("2011-01-01"), - pd.Timestamp("2011-01-02"), - pd.Timestamp("2011-01-03"), - ] - tz_data = [ - pd.Timestamp("2011-01-01", tz="US/Eastern"), - pd.Timestamp("2011-01-02", tz="US/Eastern"), - pd.Timestamp("2011-01-03", tz="US/Eastern"), - ] - - td_data = [ - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - ] - - period_data = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - self.data = { - "bool": [True, False, True], - "int64": [1, 2, 3], - "float64": [1.1, np.nan, 3.3], - "category": Categorical(["X", "Y", "Z"]), - "object": ["a", "b", "c"], - "datetime64[ns]": dt_data, - "datetime64[ns, US/Eastern]": tz_data, - "timedelta64[ns]": td_data, - "period[M]": period_data, - } + @pytest.fixture(params=sorted(data_dict.keys())) + def item(self, request): + key = request.param + return key, data_dict[key] + + item2 = item def _check_expected_dtype(self, obj, label): """ @@ -71,192 +73,189 @@ def _check_expected_dtype(self, obj, label): else: raise ValueError - def test_dtypes(self): + def test_dtypes(self, item): # to confirm test case covers intended dtypes - for typ, vals in self.data.items(): - self._check_expected_dtype(Index(vals), typ) - self._check_expected_dtype(Series(vals), typ) + typ, vals = item + self._check_expected_dtype(Index(vals), typ) + self._check_expected_dtype(Series(vals), typ) - def test_concatlike_same_dtypes(self): + def test_concatlike_same_dtypes(self, item): # GH 13660 - for typ1, vals1 in self.data.items(): + typ1, vals1 = item - vals2 = vals1 - vals3 = vals1 + vals2 = vals1 + vals3 = vals1 - if typ1 == "category": - exp_data = Categorical(list(vals1) + list(vals2)) - exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3)) - else: - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3) - tm.assert_index_equal(res, exp) - - # index.append name mismatch - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="y") - res = i1.append(i2) - exp = Index(exp_data) - tm.assert_index_equal(res, exp) - - # index.append name match - i1 = Index(vals1, name="x") - i2 = Index(vals2, name="x") - res = i1.append(i2) - exp = Index(exp_data, name="x") - tm.assert_index_equal(res, exp) - - # cannot append non-index - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append(vals2) - - with pytest.raises(TypeError, match="all inputs must be Index"): - Index(vals1).append([Index(vals2), vals3]) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) - - # name mismatch - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="y") - res = s1.append(s2, ignore_index=True) - exp = 
Series(exp_data) - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # name match - s1 = Series(vals1, name="x") - s2 = Series(vals2, name="x") - res = s1.append(s2, ignore_index=True) - exp = Series(exp_data, name="x") - tm.assert_series_equal(res, exp, check_index_type=True) - - res = pd.concat([s1, s2], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # cannot append non-index - msg = ( - r"cannot concatenate object of type '.+'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - Series(vals1).append(vals2) - - with pytest.raises(TypeError, match=msg): - Series(vals1).append([Series(vals2), vals3]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), vals2]) - - with pytest.raises(TypeError, match=msg): - pd.concat([Series(vals1), Series(vals2), vals3]) - - def test_concatlike_dtypes_coercion(self): + if typ1 == "category": + exp_data = Categorical(list(vals1) + list(vals2)) + exp_data3 = Categorical(list(vals1) + list(vals2) + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="y") + res = i1.append(i2) + exp = Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = Index(vals1, name="x") + i2 = Index(vals2, name="x") + res = i1.append(i2) + exp = Index(exp_data, name="x") + tm.assert_index_equal(res, exp) + + # cannot append non-index + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append(vals2) + + with pytest.raises(TypeError, match="all inputs must be Index"): + Index(vals1).append([Index(vals2), vals3]) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append([Series(vals2), Series(vals3)], ignore_index=True) + exp = Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="y") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = Series(vals1, name="x") + s2 = Series(vals2, name="x") + res = s1.append(s2, ignore_index=True) + exp = Series(exp_data, name="x") + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = ( + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + 
Series(vals1).append(vals2) + + with pytest.raises(TypeError, match=msg): + Series(vals1).append([Series(vals2), vals3]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), vals2]) + + with pytest.raises(TypeError, match=msg): + pd.concat([Series(vals1), Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self, item, item2): # GH 13660 - for typ1, vals1 in self.data.items(): - for typ2, vals2 in self.data.items(): - - vals3 = vals2 - - # basically infer - exp_index_dtype = None - exp_series_dtype = None - - if typ1 == typ2: - # same dtype is tested in test_concatlike_same_dtypes - continue - elif typ1 == "category" or typ2 == "category": - # TODO: suspicious - continue - - # specify expected dtype - if typ1 == "bool" and typ2 in ("int64", "float64"): - # series coerces to numeric based on numpy rule - # index doesn't because bool is object dtype - exp_series_dtype = typ2 - elif typ2 == "bool" and typ1 in ("int64", "float64"): - exp_series_dtype = typ1 - elif ( - typ1 == "datetime64[ns, US/Eastern]" - or typ2 == "datetime64[ns, US/Eastern]" - or typ1 == "timedelta64[ns]" - or typ2 == "timedelta64[ns]" - ): - exp_index_dtype = object - exp_series_dtype = object - - exp_data = vals1 + vals2 - exp_data3 = vals1 + vals2 + vals3 - - # ----- Index ----- # - - # index.append - res = Index(vals1).append(Index(vals2)) - exp = Index(exp_data, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # 3 elements - res = Index(vals1).append([Index(vals2), Index(vals3)]) - exp = Index(exp_data3, dtype=exp_index_dtype) - tm.assert_index_equal(res, exp) - - # ----- Series ----- # - - # series.append - res = Series(vals1).append(Series(vals2), ignore_index=True) - exp = Series(exp_data, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp, check_index_type=True) - - # concat - res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) - tm.assert_series_equal(res, exp, check_index_type=True) - - # 3 elements - res = Series(vals1).append( - [Series(vals2), Series(vals3)], ignore_index=True - ) - exp = Series(exp_data3, dtype=exp_series_dtype) - tm.assert_series_equal(res, exp) - - res = pd.concat( - [Series(vals1), Series(vals2), Series(vals3)], - ignore_index=True, - ) - tm.assert_series_equal(res, exp) + typ1, vals1 = item + typ2, vals2 = item2 + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + return + elif typ1 == "category" or typ2 == "category": + # The `vals1 + vals2` below fails bc one of these is a Categorical + # instead of a list; we have separate dedicated tests for categorical + return + + # specify expected dtype + if typ1 == "bool" and typ2 in ("int64", "float64"): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == "bool" and typ1 in ("int64", "float64"): + exp_series_dtype = typ1 + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" + or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = Index(vals1).append(Index(vals2)) + exp = Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = Index(vals1).append([Index(vals2), Index(vals3)]) + exp = Index(exp_data3, dtype=exp_index_dtype) + 
tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = Series(vals1).append(Series(vals2), ignore_index=True) + exp = Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([Series(vals1), Series(vals2)], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = Series(vals1).append([Series(vals2), Series(vals3)], ignore_index=True) + exp = Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat( + [Series(vals1), Series(vals2), Series(vals3)], + ignore_index=True, + ) + tm.assert_series_equal(res, exp) def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 diff --git a/pandas/tests/reshape/concat/test_index.py b/pandas/tests/reshape/concat/test_index.py index a4d6a41c7eb50..35cf670398664 100644 --- a/pandas/tests/reshape/concat/test_index.py +++ b/pandas/tests/reshape/concat/test_index.py @@ -257,3 +257,52 @@ def test_concat_multiindex_dfs_with_deepcopy(self): tm.assert_frame_equal(result_copy, expected) result_no_copy = concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) + + @pytest.mark.parametrize( + "mi1_list", + [ + [["a"], range(2)], + [["b"], np.arange(2.0, 4.0)], + [["c"], ["A", "B"]], + [["d"], pd.date_range(start="2017", end="2018", periods=2)], + ], + ) + @pytest.mark.parametrize( + "mi2_list", + [ + [["a"], range(2)], + [["b"], np.arange(2.0, 4.0)], + [["c"], ["A", "B"]], + [["d"], pd.date_range(start="2017", end="2018", periods=2)], + ], + ) + def test_concat_with_various_multiindex_dtypes( + self, mi1_list: list, mi2_list: list + ): + # GitHub #23478 + mi1 = MultiIndex.from_product(mi1_list) + mi2 = MultiIndex.from_product(mi2_list) + + df1 = DataFrame(np.zeros((1, len(mi1))), columns=mi1) + df2 = DataFrame(np.zeros((1, len(mi2))), columns=mi2) + + if mi1_list[0] == mi2_list[0]: + expected_mi = MultiIndex( + levels=[mi1_list[0], list(mi1_list[1])], + codes=[[0, 0, 0, 0], [0, 1, 0, 1]], + ) + else: + expected_mi = MultiIndex( + levels=[ + mi1_list[0] + mi2_list[0], + list(mi1_list[1]) + list(mi2_list[1]), + ], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + ) + + expected_df = DataFrame(np.zeros((1, len(expected_mi))), columns=expected_mi) + + with tm.assert_produces_warning(None): + result_df = concat((df1, df2), axis=1) + + tm.assert_frame_equal(expected_df, result_df) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 371a7fed543e4..2f9f31ebb0485 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -597,7 +597,7 @@ def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): tm.assert_frame_equal(actual, expected) def test_merge_nosort(self): - # GH#2098, TODO: anything to do? 
+ # GH#2098 d = { "var1": np.random.randint(0, 10, size=10), diff --git a/pandas/tests/series/indexing/test_get.py b/pandas/tests/series/indexing/test_get.py index 23db91e25125d..e8034bd4f7160 100644 --- a/pandas/tests/series/indexing/test_get.py +++ b/pandas/tests/series/indexing/test_get.py @@ -158,8 +158,7 @@ def test_get_with_default(): "arr", [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], ) -def test_get2(arr): - # TODO: better name, possibly split +def test_get_with_ea(arr): # GH#21260 ser = Series(arr, index=[2 * i for i in range(len(arr))]) assert ser.get(4) == ser.iloc[2] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 7082edb877ec1..3e8e1b3f436ec 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -107,7 +107,7 @@ def test_setitem_with_tz(self, tz, indexer_sli): tm.assert_series_equal(ser, exp) def test_setitem_with_tz_dst(self, indexer_sli): - # GH XXX TODO: fill in GH ref + # GH#14146 trouble setting values near DST boundary tz = "US/Eastern" orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 9dfb361527b3e..9efd4db14541b 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import IntervalArray class TestSeriesReplace: @@ -148,20 +149,21 @@ def test_replace_with_single_list(self): tm.assert_series_equal(s, ser) def test_replace_mixed_types(self): - s = pd.Series(np.arange(5), dtype="int64") + ser = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): - sc = s.copy() - r = s.replace(to_rep, val) + sc = ser.copy() + result = ser.replace(to_rep, val) return_value = sc.replace(to_rep, val, inplace=True) assert return_value is None - tm.assert_series_equal(expected, r) + tm.assert_series_equal(expected, result) tm.assert_series_equal(expected, sc) - # MUST upcast to float - e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) + # 3.0 can still be held in our int64 series, so we do not upcast GH#44940 tr, v = [3], [3.0] - check_replace(tr, v, e) + check_replace(tr, v, ser) + # Note this matches what we get with the scalars 3 and 3.0 + check_replace(tr[0], v[0], ser) # MUST upcast to float e = pd.Series([0, 1, 2, 3.5, 4]) @@ -257,10 +259,10 @@ def test_replace2(self): assert (ser[20:30] == -1).all() def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): - # GH 32621 - s = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) - expected = pd.Series(["1", "2", np.nan]) - result = s.replace({"one": "1", "two": "2"}) + # GH 32621, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) + expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) + result = ser.replace({"one": "1", "two": "2"}) tm.assert_series_equal(expected, result) def test_replace_with_empty_dictlike(self): @@ -305,17 +307,18 @@ def test_replace_mixed_types_with_string(self): "categorical, numeric", [ (pd.Categorical(["A"], categories=["A", "B"]), [1]), - (pd.Categorical(("A",), categories=["A", "B"]), [1]), - (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) def test_replace_categorical(self, 
categorical, numeric): - # GH 24971 - # Do not check if dtypes are equal due to a known issue that - # Categorical.replace sometimes coerces to object (GH 23305) - s = pd.Series(categorical) - result = s.replace({"A": 1, "B": 2}) - expected = pd.Series(numeric) + # GH 24971, GH#23305 + ser = pd.Series(categorical) + result = ser.replace({"A": 1, "B": 2}) + expected = pd.Series(numeric).astype("category") + if 2 not in expected.cat.categories: + # i.e. categories should be [1, 2] even if there are no "B"s present + # GH#44940 + expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result) def test_replace_categorical_single(self): @@ -515,6 +518,7 @@ def test_pandas_replace_na(self): exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) +<<<<<<< HEAD @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 @@ -522,3 +526,91 @@ def test_replace_regex_dtype_series(self, regex): expected = pd.Series([1]) result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) +======= + @pytest.mark.parametrize( + "dtype, input_data, to_replace, expected_data", + [ + ("bool", [True, False], {True: False}, [False, False]), + ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]), + ( + pd.IntervalDtype("int64"), + IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), + {pd.Interval(1, 2): pd.Interval(10, 20)}, + IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), + ), + ( + pd.IntervalDtype("float64"), + IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), + {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)}, + IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), + ), + ( + pd.PeriodDtype("M"), + [pd.Period("2020-05", freq="M")], + {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, + [pd.Period("2020-06", freq="M")], + ), + ], + ) + def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): + # GH#33484 + ser = pd.Series(input_data, dtype=dtype) + result = ser.replace(to_replace) + expected = pd.Series(expected_data, dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_replace_string_dtype(self): + # GH#40732, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype="string") + res = ser.replace({"one": "1", "two": "2"}) + expected = pd.Series(["1", "2", np.nan], dtype="string") + tm.assert_series_equal(res, expected) + + # GH#31644 + ser2 = pd.Series(["A", np.nan], dtype="string") + res2 = ser2.replace("A", "B") + expected2 = pd.Series(["B", np.nan], dtype="string") + tm.assert_series_equal(res2, expected2) + + ser3 = pd.Series(["A", "B"], dtype="string") + res3 = ser3.replace("A", pd.NA) + expected3 = pd.Series([pd.NA, "B"], dtype="string") + tm.assert_series_equal(res3, expected3) + + def test_replace_string_dtype_list_to_replace(self): + # GH#41215, GH#44940 + ser = pd.Series(["abc", "def"], dtype="string") + res = ser.replace(["abc", "any other string"], "xyz") + expected = pd.Series(["xyz", "def"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_string_dtype_regex(self): + # GH#31644 + ser = pd.Series(["A", "B"], dtype="string") + res = ser.replace(r".", "C", regex=True) + expected = 
pd.Series(["C", "C"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_nullable_numeric(self): + # GH#40732, GH#44940 + + floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) + assert floats.replace({1.0: 9}).dtype == floats.dtype + assert floats.replace(1.0, 9).dtype == floats.dtype + assert floats.replace({1.0: 9.0}).dtype == floats.dtype + assert floats.replace(1.0, 9.0).dtype == floats.dtype + + res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) + assert res.dtype == floats.dtype + + ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) + assert ints.replace({1: 9}).dtype == ints.dtype + assert ints.replace(1, 9).dtype == ints.dtype + assert ints.replace({1: 9.0}).dtype == ints.dtype + assert ints.replace(1, 9.0).dtype == ints.dtype + # FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element +>>>>>>> 3e9f09fa63c6231a36075317d8cd8bb2f3672901 diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index cbe4f950494be..74458c13e8df7 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -434,10 +434,10 @@ def test_split_with_name_index(): ], ], ) -def test_partition_series_more_than_one_char(method, exp): +def test_partition_series_more_than_one_char(method, exp, any_string_dtype): # https://github.com/pandas-dev/pandas/issues/23558 # more than one char - s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None]) + s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 17efdd7e4f98b..fcb50e463d9f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1721,9 +1721,9 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): ], ) def test_hashtable_large_sizehint(self, hashtable): - # GH 22729 + # GH#22729 smoketest for not raising when passing a large size_hint size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) # noqa + hashtable(size_hint=size_hint) def test_unique_label_indices(): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 6a39638af9c87..d32c72b3df974 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -25,10 +25,24 @@ ) +@pytest.fixture +def left_right(): + low, high, n = -1 << 10, 1 << 10, 1 << 20 + left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) + left["left"] = left.sum(axis=1) + + # one-2-one match + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + right.columns = right.columns[:-1].tolist() + ["right"] + right.index = np.arange(len(right)) + right["right"] *= -1 + return left, right + + class TestSorting: @pytest.mark.slow def test_int64_overflow(self): - B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) A = np.arange(2500) df = DataFrame( @@ -67,17 +81,18 @@ def test_int64_overflow(self): assert left[k] == v assert len(left) == len(right) - def test_int64_overflow_moar(self): - + def test_int64_overflow_groupby_large_range(self): # GH9096 values = range(55109) data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values}) grouped = data.groupby(["a", "b", "c", "d"]) assert len(grouped) == len(values) + @pytest.mark.parametrize("agg", ["mean", "median"]) + def 
test_int64_overflow_groupby_large_df_shuffled(self, agg): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) i = np.random.choice(len(arr), len(arr) * 4) - arr = np.vstack((arr, arr[i])) # add sume duplicate rows + arr = np.vstack((arr, arr[i])) # add some duplicate rows i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows @@ -98,42 +113,98 @@ def test_int64_overflow_moar(self): assert len(gr) == len(jim) mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) - def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype="f8") - arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=["jim", "joe"], index=mi) - return res.sort_index() - - tm.assert_frame_equal(gr.mean(), aggr(np.mean)) - tm.assert_frame_equal(gr.median(), aggr(np.median)) - - def test_lexsort_indexer(self): + f = lambda a: np.fromiter(map(getattr(np, agg), a), dtype="f8") + arr = np.vstack((f(jim.values()), f(joe.values()))).T + res = DataFrame(arr, columns=["jim", "joe"], index=mi).sort_index() + + tm.assert_frame_equal(getattr(gr, agg)(), res) + + @pytest.mark.parametrize( + "order, na_position, exp", + [ + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + ], + [ + True, + "first", + list(range(5)) + list(range(105, 110)) + list(range(5, 105)), + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + ], + ], + ) + def test_lexsort_indexer(self, order, na_position, exp): keys = [[np.nan] * 5 + list(range(100)) + [np.nan] * 5] - # orders=True, na_position='last' - result = lexsort_indexer(keys, orders=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=True, na_position='first' - result = lexsort_indexer(keys, orders=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='last' - result = lexsort_indexer(keys, orders=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - - # orders=False, na_position='first' - result = lexsort_indexer(keys, orders=False, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + result = lexsort_indexer(keys, orders=order, na_position=na_position) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) - def test_nargsort(self): - # np.argsort(items) places NaNs last - items = [np.nan] * 5 + list(range(100)) + [np.nan] * 5 - # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype="O") + @pytest.mark.parametrize( + "ascending, na_position, exp, box", + [ + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + list, + ], + [ + True, + "first", + list(range(5)) + list(range(105, 110)) + list(range(5, 105)), + list, + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + list, + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + list, + ], + [ + True, + "last", + list(range(5, 105)) + list(range(5)) + list(range(105, 110)), + lambda x: np.array(x, dtype="O"), + ], + [ + True, + "first", + list(range(5)) + 
list(range(105, 110)) + list(range(5, 105)), + lambda x: np.array(x, dtype="O"), + ], + [ + False, + "last", + list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)), + lambda x: np.array(x, dtype="O"), + ], + [ + False, + "first", + list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)), + lambda x: np.array(x, dtype="O"), + ], + ], + ) + def test_nargsort(self, ascending, na_position, exp, box): + # list places NaNs last, np.array(..., dtype="O") may not place NaNs first + items = box([np.nan] * 5 + list(range(100)) + [np.nan] * 5) # mergesort is the most difficult to get right because we want it to be # stable. @@ -143,71 +214,23 @@ def test_nargsort(self): # because quick and merge sort fall over to insertion sort for small # arrays.""" - # mergesort, ascending=True, na_position='last' - result = nargsort(items, kind="mergesort", ascending=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = nargsort(items, kind="mergesort", ascending=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = nargsort(items, kind="mergesort", ascending=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' - result = nargsort(items, kind="mergesort", ascending=False, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='last' - result = nargsort(items2, kind="mergesort", ascending=True, na_position="last") - exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=True, na_position='first' - result = nargsort(items2, kind="mergesort", ascending=True, na_position="first") - exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='last' - result = nargsort(items2, kind="mergesort", ascending=False, na_position="last") - exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) - - # mergesort, ascending=False, na_position='first' result = nargsort( - items2, kind="mergesort", ascending=False, na_position="first" + items, kind="mergesort", ascending=ascending, na_position=na_position ) - exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) class TestMerge: - @pytest.mark.slow - def test_int64_overflow_issues(self): - + def test_int64_overflow_outer_merge(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) - - # it works! 
result = merge(df1, df2, how="outer") assert len(result) == 2000 - low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) - left["left"] = left.sum(axis=1) - - # one-2-one match - i = np.random.permutation(len(left)) - right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ["right"] - right.index = np.arange(len(right)) - right["right"] *= -1 + @pytest.mark.slow + def test_int64_overflow_check_sum_col(self, left_right): + left, right = left_right out = merge(left, right, how="outer") assert len(out) == len(left) @@ -216,10 +239,19 @@ def test_int64_overflow_issues(self): tm.assert_series_equal(out["left"], result, check_names=False) assert result.name is None + @pytest.mark.slow + @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) + def test_int64_overflow_how_merge(self, left_right, how): + left, right = left_right + + out = merge(left, right, how="outer") out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) - for how in ["left", "right", "outer", "inner"]: - tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) + + @pytest.mark.slow + def test_int64_overflow_sort_false_order(self, left_right): + left, right = left_right # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how="left", sort=False) @@ -228,8 +260,12 @@ def test_int64_overflow_issues(self): out = merge(right, left, how="left", sort=False) tm.assert_frame_equal(right, out[right.columns.tolist()]) + @pytest.mark.slow + @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) + @pytest.mark.parametrize("sort", [True, False]) + def test_int64_overflow_one_to_many_none_match(self, how, sort): # one-2-many/none match - n = 1 << 11 + low, high, n = -1 << 10, 1 << 10, 1 << 11 left = DataFrame( np.random.randint(low, high, (n, 7)).astype("int64"), columns=list("ABCDEFG"), @@ -300,12 +336,6 @@ def align(df): df.index = np.arange(len(df)) return df - def verify_order(df): - kcols = list("ABCDEFG") - tm.assert_frame_equal( - df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort") - ) - out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) @@ -316,84 +346,81 @@ def verify_order(df): "outer": np.ones(len(out), dtype="bool"), } - for how in ["left", "right", "outer", "inner"]: - mask = jmask[how] - frame = align(out[mask].copy()) - assert mask.all() ^ mask.any() or how == "outer" - - for sort in [False, True]: - res = merge(left, right, how=how, sort=sort) - if sort: - verify_order(res) - - # as in GH9092 dtypes break with outer/right join - tm.assert_frame_equal( - frame, align(res), check_dtype=how not in ("right", "outer") - ) - - -def test_decons(): - def testit(codes_list, shape): - group_index = get_group_index(codes_list, shape, sort=True, xnull=True) - codes_list2 = decons_group_index(group_index, shape) + mask = jmask[how] + frame = align(out[mask].copy()) + assert mask.all() ^ mask.any() or how == "outer" - for a, b in zip(codes_list, codes_list2): - tm.assert_numpy_array_equal(a, b) + res = merge(left, right, how=how, sort=sort) + if sort: + kcols = list("ABCDEFG") + tm.assert_frame_equal( + res[kcols].copy(), res[kcols].sort_values(kcols, kind="mergesort") + ) - shape = (4, 5, 6) - codes_list = [ - np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), - 
np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), - ] - testit(codes_list, shape) + # as in GH9092 dtypes break with outer/right join + # 2021-12-18: dtype does not break anymore + tm.assert_frame_equal(frame, align(res)) + + +@pytest.mark.parametrize( + "codes_list, shape", + [ + [ + [ + np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), + ], + (4, 5, 6), + ], + [ + [ + np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5), + ], + (10000, 10000), + ], + ], +) +def test_decons(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - shape = (10000, 10000) - codes_list = [ - np.tile(np.arange(10000, dtype=np.int64), 5), - np.tile(np.arange(10000, dtype=np.int64), 5), - ] - testit(codes_list, shape) + for a, b in zip(codes_list, codes_list2): + tm.assert_numpy_array_equal(a, b) class TestSafeSort: - def test_basic_sort(self): - values = [3, 1, 2, 0, 4] - result = safe_sort(values) - expected = np.array([0, 1, 2, 3, 4]) - tm.assert_numpy_array_equal(result, expected) - - values = list("baaacb") - result = safe_sort(values) - expected = np.array(list("aaabbc"), dtype="object") - tm.assert_numpy_array_equal(result, expected) - - values = [] - result = safe_sort(values) - expected = np.array([]) + @pytest.mark.parametrize( + "arg, exp", + [ + [[3, 1, 2, 0, 4], [0, 1, 2, 3, 4]], + [list("baaacb"), np.array(list("aaabbc"), dtype=object)], + [[], []], + ], + ) + def test_basic_sort(self, arg, exp): + result = safe_sort(arg) + expected = np.array(exp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_codes(self, verify): + @pytest.mark.parametrize( + "codes, exp_codes, na_sentinel", + [ + [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4], -1], + [[0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4], 99], + [[], [], -1], + ], + ) + def test_codes(self, verify, codes, exp_codes, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - codes = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_codes = safe_sort(values, codes, verify=verify) - expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_codes, expected_codes) - - # na_sentinel - codes = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) - expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_codes, expected_codes) - - codes = [] - result, result_codes = safe_sort(values, codes, verify=verify) - expected_codes = np.array([], dtype=np.intp) + result, result_codes = safe_sort( + values, codes, na_sentinel=na_sentinel, verify=verify + ) + expected_codes = np.array(exp_codes, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) @@ -411,12 +438,14 @@ def test_codes_out_of_bound(self, na_sentinel): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - def test_mixed_integer(self): - values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) + @pytest.mark.parametrize("box", [lambda x: np.array(x, dtype=object), list]) + def test_mixed_integer(self, box): 
+ values = box(["b", 1, 0, "a", 0, "b"]) result = safe_sort(values) expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + def test_mixed_integer_with_codes(self): values = np.array(["b", 1, 0, "a"], dtype=object) codes = [0, 1, 2, 3, 0, -1, 1] result, result_codes = safe_sort(values, codes) @@ -425,12 +454,6 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - def test_mixed_integer_from_list(self): - values = ["b", 1, 0, "a", 0, "b"] - result = safe_sort(values) - expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) - tm.assert_numpy_array_equal(result, expected) - def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) @@ -438,22 +461,25 @@ def test_unsortable(self): with pytest.raises(TypeError, match=msg): safe_sort(arr) - def test_exceptions(self): - with pytest.raises(TypeError, match="Only list-like objects are allowed"): - safe_sort(values=1) - - with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], codes=1) - - with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) - - def test_extension_array(self): - # a = array([1, 3, np.nan, 2], dtype='Int64') - a = array([1, 3, 2], dtype="Int64") + @pytest.mark.parametrize( + "arg, codes, err, msg", + [ + [1, None, TypeError, "Only list-like objects are allowed"], + [[0, 1, 2], 1, TypeError, "Only list-like objects or None"], + [[0, 1, 2, 1], [0, 1], ValueError, "values should be unique"], + ], + ) + def test_exceptions(self, arg, codes, err, msg): + with pytest.raises(err, match=msg): + safe_sort(values=arg, codes=codes) + + @pytest.mark.parametrize( + "arg, exp", [[[1, 3, 2], [1, 2, 3]], [[1, 3, np.nan, 2], [1, 2, 3, np.nan]]] + ) + def test_extension_array(self, arg, exp): + a = array(arg, dtype="Int64") result = safe_sort(a) - # expected = array([1, 2, 3, np.nan], dtype='Int64') - expected = array([1, 2, 3], dtype="Int64") + expected = array(exp, dtype="Int64") tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6cec35939dff2..5dcfd0019e93f 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -668,14 +668,6 @@ def test_rule_code(self): assert alias == (_get_offset(alias) * 5).rule_code -def test_dateoffset_misc(): - oset = offsets.DateOffset(months=2, days=4) - # it works - oset.freqstr - - assert not offsets.DateOffset(months=2) == 2 - - def test_freq_offsets(): off = BDay(1, offset=timedelta(0, 1800)) assert off.freqstr == "B+30Min" @@ -791,6 +783,54 @@ def test_tick_normalize_raises(tick_classes): cls(n=3, normalize=True) +@pytest.mark.parametrize( + "offset_kwargs, expected_arg", + [ + ({"nanoseconds": 1}, "1970-01-01 00:00:00.000000001"), + ({"nanoseconds": 5}, "1970-01-01 00:00:00.000000005"), + ({"nanoseconds": -1}, "1969-12-31 23:59:59.999999999"), + ({"microseconds": 1}, "1970-01-01 00:00:00.000001"), + ({"microseconds": -1}, "1969-12-31 23:59:59.999999"), + ({"seconds": 1}, "1970-01-01 00:00:01"), + ({"seconds": -1}, "1969-12-31 23:59:59"), + ({"minutes": 1}, "1970-01-01 00:01:00"), + ({"minutes": -1}, "1969-12-31 23:59:00"), + ({"hours": 1}, "1970-01-01 01:00:00"), + ({"hours": -1}, "1969-12-31 23:00:00"), + ({"days": 1}, 
"1970-01-02 00:00:00"), + ({"days": -1}, "1969-12-31 00:00:00"), + ({"weeks": 1}, "1970-01-08 00:00:00"), + ({"weeks": -1}, "1969-12-25 00:00:00"), + ({"months": 1}, "1970-02-01 00:00:00"), + ({"months": -1}, "1969-12-01 00:00:00"), + ({"years": 1}, "1971-01-01 00:00:00"), + ({"years": -1}, "1969-01-01 00:00:00"), + ], +) +def test_dateoffset_add_sub(offset_kwargs, expected_arg): + offset = DateOffset(**offset_kwargs) + ts = Timestamp(0) + result = ts + offset + expected = Timestamp(expected_arg) + assert result == expected + result -= offset + assert result == ts + result = offset + ts + assert result == expected + + +def test_dataoffset_add_sub_timestamp_with_nano(): + offset = DateOffset(minutes=2, nanoseconds=9) + ts = Timestamp(4) + result = ts + offset + expected = Timestamp("1970-01-01 00:02:00.000000013") + assert result == expected + result -= offset + assert result == ts + result = offset + ts + assert result == expected + + @pytest.mark.parametrize( "attribute", [ @@ -806,3 +846,11 @@ def test_dateoffset_immutable(attribute): msg = "DateOffset objects are immutable" with pytest.raises(AttributeError, match=msg): setattr(offset, attribute, 5) + + +def test_dateoffset_misc(): + oset = offsets.DateOffset(months=2, days=4) + # it works + oset.freqstr + + assert not offsets.DateOffset(months=2) == 2 diff --git a/pandas/tests/window/moments/conftest.py b/pandas/tests/window/moments/conftest.py index b192f72c8f08b..8f7c20fe03a02 100644 --- a/pandas/tests/window/moments/conftest.py +++ b/pandas/tests/window/moments/conftest.py @@ -10,41 +10,50 @@ ) -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(dtype=np.float64, name="a"), - Series([np.nan] * 5), - Series([1.0] * 5), - Series(range(5, 0, -1)), - Series(range(5)), - Series([np.nan, 1.0, np.nan, 1.0, 1.0]), - Series([np.nan, 1.0, np.nan, 2.0, 3.0]), - Series([np.nan, 1.0, np.nan, 3.0, 2.0]), - ] - - def create_dataframes(): - return [ - DataFrame(columns=["a", "a"]), - DataFrame(np.arange(15).reshape((5, 3)), columns=["a", "a", 99]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel("K") - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - +def create_series(): return [ - (x, is_constant(x), no_nans(x)) - for x in itertools.chain(create_dataframes(), create_dataframes()) + Series(dtype=np.float64, name="a"), + Series([np.nan] * 5), + Series([1.0] * 5), + Series(range(5, 0, -1)), + Series(range(5)), + Series([np.nan, 1.0, np.nan, 1.0, 1.0]), + Series([np.nan, 1.0, np.nan, 2.0, 3.0]), + Series([np.nan, 1.0, np.nan, 3.0, 2.0]), ] -@pytest.fixture(params=_create_consistency_data()) -def consistency_data(request): +def create_dataframes(): + return [ + DataFrame(columns=["a", "a"]), + DataFrame(np.arange(15).reshape((5, 3)), columns=["a", "a", 99]), + ] + [DataFrame(s) for s in create_series()] + + +def is_constant(x): + values = x.values.ravel("K") + return len(set(values[notna(values)])) == 1 + + +@pytest.fixture( + params=( + obj + for obj in itertools.chain(create_series(), create_dataframes()) + if is_constant(obj) + ), + scope="module", +) +def consistent_data(request): + return request.param + + +@pytest.fixture(params=create_series()) +def series_data(request): + return request.param + + +@pytest.fixture(params=itertools.chain(create_series(), create_dataframes())) +def all_data(request): """ Test: - Empty Series / DataFrame diff --git 
a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py index 8feec32ba99c5..f9f09bffb14b1 100644 --- a/pandas/tests/window/moments/test_moments_consistency_ewm.py +++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py @@ -30,7 +30,7 @@ def create_mock_weights(obj, com, adjust, ignore_na): def create_mock_series_weights(s, com, adjust, ignore_na): - w = Series(np.nan, index=s.index) + w = Series(np.nan, index=s.index, name=s.name) alpha = 1.0 / (1.0 + com) if adjust: count = 0 @@ -58,63 +58,66 @@ def create_mock_series_weights(s, com, adjust, ignore_na): return w -def test_ewm_consistency_mean(consistency_data, adjust, ignore_na, min_periods): - x, is_constant, no_nans = consistency_data +def test_ewm_consistency_mean(all_data, adjust, ignore_na, min_periods): com = 3.0 - result = x.ewm( + result = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).mean() - weights = create_mock_weights(x, com=com, adjust=adjust, ignore_na=ignore_na) + weights = create_mock_weights(all_data, com=com, adjust=adjust, ignore_na=ignore_na) expected = ( - x.multiply(weights).cumsum().divide(weights.cumsum()).fillna(method="ffill") + all_data.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") ) expected[ - x.expanding().count() < (max(min_periods, 1) if min_periods else 1) + all_data.expanding().count() < (max(min_periods, 1) if min_periods else 1) ] = np.nan tm.assert_equal(result, expected.astype("float64")) -def test_ewm_consistency_consistent(consistency_data, adjust, ignore_na, min_periods): - x, is_constant, no_nans = consistency_data +def test_ewm_consistency_consistent(consistent_data, adjust, ignore_na, min_periods): com = 3.0 - if is_constant: - count_x = x.expanding().count() - mean_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(x) - exp = x.max() if isinstance(x, Series) else x.max().max() + count_x = consistent_data.expanding().count() + mean_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(consistent_data) + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) def test_ewm_consistency_var_debiasing_factors( - consistency_data, adjust, ignore_na, min_periods + all_data, adjust, ignore_na, min_periods ): - x, is_constant, no_nans = consistency_data com = 3.0 # check variance debiasing factors - var_unbiased_x = x.ewm( + var_unbiased_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na 
).var(bias=False) - var_biased_x = x.ewm( + var_biased_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=True) - weights = create_mock_weights(x, com=com, adjust=adjust, ignore_na=ignore_na) + weights = create_mock_weights(all_data, com=com, adjust=adjust, ignore_na=ignore_na) cum_sum = weights.cumsum().fillna(method="ffill") cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") numerator = cum_sum * cum_sum @@ -126,16 +129,13 @@ def test_ewm_consistency_var_debiasing_factors( @pytest.mark.parametrize("bias", [True, False]) -def test_moments_consistency_var( - consistency_data, adjust, ignore_na, min_periods, bias -): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var(all_data, adjust, ignore_na, min_periods, bias): com = 3.0 - mean_x = x.ewm( + mean_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).mean() - var_x = x.ewm( + var_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=bias) assert not (var_x < 0).any().any() @@ -143,7 +143,7 @@ def test_moments_consistency_var( if bias: # check that biased var(x) == mean(x^2) - mean(x)^2 mean_x2 = ( - (x * x) + (all_data * all_data) .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) .mean() ) @@ -152,35 +152,32 @@ def test_moments_consistency_var( @pytest.mark.parametrize("bias", [True, False]) def test_moments_consistency_var_constant( - consistency_data, adjust, ignore_na, min_periods, bias + consistent_data, adjust, ignore_na, min_periods, bias ): - x, is_constant, no_nans = consistency_data com = 3.0 - if is_constant: - count_x = x.expanding(min_periods=min_periods).count() - var_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) + count_x = consistent_data.expanding(min_periods=min_periods).count() + var_x = consistent_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if not bias: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if not bias: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("bias", [True, False]) -def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, bias): - x, is_constant, no_nans = consistency_data +def test_ewm_consistency_std(all_data, adjust, ignore_na, min_periods, bias): com = 3.0 - var_x = x.ewm( + var_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).var(bias=bias) assert not (var_x < 0).any().any() - std_x = x.ewm( + std_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na ).std(bias=bias) assert not (std_x < 0).any().any() @@ -188,9 +185,9 @@ def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, b # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.ewm( + cov_x_x = all_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(x, bias=bias) + ).cov(all_data, bias=bias) assert not (cov_x_x < 0).any().any() # check that var(x) == 
cov(x, x) @@ -199,57 +196,53 @@ def test_ewm_consistency_std(consistency_data, adjust, ignore_na, min_periods, b @pytest.mark.parametrize("bias", [True, False]) def test_ewm_consistency_series_cov_corr( - consistency_data, adjust, ignore_na, min_periods, bias + series_data, adjust, ignore_na, min_periods, bias ): - x, is_constant, no_nans = consistency_data com = 3.0 - if isinstance(x, Series): - var_x_plus_y = ( - (x + x) - .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) - .var(bias=bias) - ) - var_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) - var_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=bias) - cov_x_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(x, bias=bias) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(x, bias=bias) - std_x = x.ewm( + var_x_plus_y = ( + (series_data + series_data) + .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) + .var(bias=bias) + ) + var_x = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) + var_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=bias) + cov_x_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(series_data, bias=bias) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(series_data, bias=bias) + std_x = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=bias) + std_y = series_data.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=bias) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if bias: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=bias) - std_y = x.ewm( + ).mean() + mean_y = series_data.ewm( com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=bias) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if bias: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - mean_y = x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean() - mean_x_times_y = ( - (x * x) - .ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ) - .mean() - ) - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) + ).mean() + mean_x_times_y = ( + (series_data * series_data) + .ewm(com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na) + .mean() + ) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) diff --git a/pandas/tests/window/moments/test_moments_consistency_expanding.py b/pandas/tests/window/moments/test_moments_consistency_expanding.py index 
14314f80f152c..dafc60a057c0f 100644 --- a/pandas/tests/window/moments/test_moments_consistency_expanding.py +++ b/pandas/tests/window/moments/test_moments_consistency_expanding.py @@ -5,67 +5,68 @@ import pandas._testing as tm +def no_nans(x): + return x.notna().all().all() + + +def all_na(x): + return x.isnull().all().all() + + @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum, np.sum]) -def test_expanding_apply_consistency_sum_nans(consistency_data, min_periods, f): - x, is_constant, no_nans = consistency_data - - if f is np.nansum and min_periods == 0: - pass - elif f is np.sum and not no_nans: - pass - else: - expanding_f_result = x.expanding(min_periods=min_periods).sum() - expanding_apply_f_result = x.expanding(min_periods=min_periods).apply( - func=f, raw=True - ) - tm.assert_equal(expanding_f_result, expanding_apply_f_result) +def test_expanding_apply_consistency_sum_nans(request, all_data, min_periods, f): + if f is np.sum: + if not no_nans(all_data) and not ( + all_na(all_data) and not all_data.empty and min_periods > 0 + ): + request.node.add_marker( + pytest.mark.xfail(reason="np.sum has different behavior with NaNs") + ) + expanding_f_result = all_data.expanding(min_periods=min_periods).sum() + expanding_apply_f_result = all_data.expanding(min_periods=min_periods).apply( + func=f, raw=True + ) + tm.assert_equal(expanding_f_result, expanding_apply_f_result) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) +def test_moments_consistency_var(all_data, min_periods, ddof): + var_x = all_data.expanding(min_periods=min_periods).var(ddof=ddof) assert not (var_x < 0).any().any() if ddof == 0: # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = (x * x).expanding(min_periods=min_periods).mean() - mean_x = x.expanding(min_periods=min_periods).mean() + mean_x2 = (all_data * all_data).expanding(min_periods=min_periods).mean() + mean_x = all_data.expanding(min_periods=min_periods).mean() tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var_constant(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var_constant(consistent_data, min_periods, ddof): + count_x = consistent_data.expanding(min_periods=min_periods).count() + var_x = consistent_data.expanding(min_periods=min_periods).var(ddof=ddof) - if is_constant: - count_x = x.expanding(min_periods=min_periods).count() - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) - - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if ddof == 1: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if ddof == 1: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("ddof", [0, 1]) -def test_expanding_consistency_var_std_cov(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) +def test_expanding_consistency_var_std_cov(all_data, min_periods, ddof): + var_x = 
all_data.expanding(min_periods=min_periods).var(ddof=ddof) assert not (var_x < 0).any().any() - std_x = x.expanding(min_periods=min_periods).std(ddof=ddof) + std_x = all_data.expanding(min_periods=min_periods).std(ddof=ddof) assert not (std_x < 0).any().any() # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.expanding(min_periods=min_periods).cov(x, ddof=ddof) + cov_x_x = all_data.expanding(min_periods=min_periods).cov(all_data, ddof=ddof) assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) @@ -73,73 +74,71 @@ def test_expanding_consistency_var_std_cov(consistency_data, min_periods, ddof): @pytest.mark.parametrize("ddof", [0, 1]) -def test_expanding_consistency_series_cov_corr(consistency_data, min_periods, ddof): - x, is_constant, no_nans = consistency_data - - if isinstance(x, Series): - var_x_plus_y = (x + x).expanding(min_periods=min_periods).var(ddof=ddof) - var_x = x.expanding(min_periods=min_periods).var(ddof=ddof) - var_y = x.expanding(min_periods=min_periods).var(ddof=ddof) - cov_x_y = x.expanding(min_periods=min_periods).cov(x, ddof=ddof) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.expanding(min_periods=min_periods).corr(x) - std_x = x.expanding(min_periods=min_periods).std(ddof=ddof) - std_y = x.expanding(min_periods=min_periods).std(ddof=ddof) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if ddof == 0: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = x.expanding(min_periods=min_periods).mean() - mean_y = x.expanding(min_periods=min_periods).mean() - mean_x_times_y = (x * x).expanding(min_periods=min_periods).mean() - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) - - -def test_expanding_consistency_mean(consistency_data, min_periods): - x, is_constant, no_nans = consistency_data - - result = x.expanding(min_periods=min_periods).mean() - expected = ( - x.expanding(min_periods=min_periods).sum() - / x.expanding(min_periods=min_periods).count() +def test_expanding_consistency_series_cov_corr(series_data, min_periods, ddof): + var_x_plus_y = ( + (series_data + series_data).expanding(min_periods=min_periods).var(ddof=ddof) ) - tm.assert_equal(result, expected.astype("float64")) + var_x = series_data.expanding(min_periods=min_periods).var(ddof=ddof) + var_y = series_data.expanding(min_periods=min_periods).var(ddof=ddof) + cov_x_y = series_data.expanding(min_periods=min_periods).cov(series_data, ddof=ddof) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.expanding(min_periods=min_periods).corr(series_data) + std_x = series_data.expanding(min_periods=min_periods).std(ddof=ddof) + std_y = series_data.expanding(min_periods=min_periods).std(ddof=ddof) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + if ddof == 0: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.expanding(min_periods=min_periods).mean() + mean_y = series_data.expanding(min_periods=min_periods).mean() + mean_x_times_y = ( + (series_data * series_data).expanding(min_periods=min_periods).mean() + ) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) -def test_expanding_consistency_constant(consistency_data, min_periods): - x, 
is_constant, no_nans = consistency_data - if is_constant: - count_x = x.expanding().count() - mean_x = x.expanding(min_periods=min_periods).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.expanding(min_periods=min_periods).corr(x) +def test_expanding_consistency_mean(all_data, min_periods): + result = all_data.expanding(min_periods=min_periods).mean() + expected = ( + all_data.expanding(min_periods=min_periods).sum() + / all_data.expanding(min_periods=min_periods).count() + ) + tm.assert_equal(result, expected.astype("float64")) - exp = x.max() if isinstance(x, Series) else x.max().max() - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) +def test_expanding_consistency_constant(consistent_data, min_periods): + count_x = consistent_data.expanding().count() + mean_x = consistent_data.expanding(min_periods=min_periods).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.expanding(min_periods=min_periods).corr(consistent_data) + + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) -def test_expanding_consistency_var_debiasing_factors(consistency_data, min_periods): - x, is_constant, no_nans = consistency_data +def test_expanding_consistency_var_debiasing_factors(all_data, min_periods): # check variance debiasing factors - var_unbiased_x = x.expanding(min_periods=min_periods).var() - var_biased_x = x.expanding(min_periods=min_periods).var(ddof=0) - var_debiasing_factors_x = x.expanding().count() / ( - x.expanding().count() - 1.0 + var_unbiased_x = all_data.expanding(min_periods=min_periods).var() + var_biased_x = all_data.expanding(min_periods=min_periods).var(ddof=0) + var_debiasing_factors_x = all_data.expanding().count() / ( + all_data.expanding().count() - 1.0 ).replace(0.0, np.nan) tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 49bc5af4e9d69..daca19b0993bf 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -5,44 +5,52 @@ import pandas._testing as tm +def no_nans(x): + return x.notna().all().all() + + +def all_na(x): + return x.isnull().all().all() + + @pytest.mark.parametrize("f", [lambda v: Series(v).sum(), np.nansum, np.sum]) def test_rolling_apply_consistency_sum( - consistency_data, rolling_consistency_cases, center, f + request, all_data, rolling_consistency_cases, center, f ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if f is np.nansum and min_periods == 0: - pass - elif f is np.sum and not no_nans: - pass - else: - rolling_f_result = x.rolling( - window=window, min_periods=min_periods, center=center - ).sum() - rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, center=center - ).apply(func=f, 
raw=True) - tm.assert_equal(rolling_f_result, rolling_apply_f_result) + if f is np.sum: + if not no_nans(all_data) and not ( + all_na(all_data) and not all_data.empty and min_periods > 0 + ): + request.node.add_marker( + pytest.mark.xfail(reason="np.sum has different behavior with NaNs") + ) + rolling_f_result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).sum() + rolling_apply_f_result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).apply(func=f, raw=True) + tm.assert_equal(rolling_f_result, rolling_apply_f_result) @pytest.mark.parametrize("ddof", [0, 1]) -def test_moments_consistency_var( - consistency_data, rolling_consistency_cases, center, ddof -): - x, is_constant, no_nans = consistency_data +def test_moments_consistency_var(all_data, rolling_consistency_cases, center, ddof): window, min_periods = rolling_consistency_cases - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( + var_x = all_data.rolling(window=window, min_periods=min_periods, center=center).var( ddof=ddof ) assert not (var_x < 0).any().any() if ddof == 0: # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x = x.rolling(window=window, min_periods=min_periods, center=center).mean() + mean_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() mean_x2 = ( - (x * x) + (all_data * all_data) .rolling(window=window, min_periods=min_periods, center=center) .mean() ) @@ -51,41 +59,38 @@ def test_moments_consistency_var( @pytest.mark.parametrize("ddof", [0, 1]) def test_moments_consistency_var_constant( - consistency_data, rolling_consistency_cases, center, ddof + consistent_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if is_constant: - count_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).count() - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) + count_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() + var_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if ddof == 1: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if ddof == 1: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) @pytest.mark.parametrize("ddof", [0, 1]) def test_rolling_consistency_var_std_cov( - consistency_data, rolling_consistency_cases, center, ddof + all_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( + var_x = all_data.rolling(window=window, min_periods=min_periods, center=center).var( ddof=ddof ) assert not (var_x < 0).any().any() - std_x = x.rolling(window=window, min_periods=min_periods, center=center).std( + std_x = all_data.rolling(window=window, min_periods=min_periods, center=center).std( ddof=ddof ) assert not (std_x < 0).any().any() @@ -93,9 +98,9 @@ def 
test_rolling_consistency_var_std_cov( # check that var(x) == std(x)^2 tm.assert_equal(var_x, std_x * std_x) - cov_x_x = x.rolling(window=window, min_periods=min_periods, center=center).cov( - x, ddof=ddof - ) + cov_x_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).cov(all_data, ddof=ddof) assert not (cov_x_x < 0).any().any() # check that var(x) == cov(x, x) @@ -104,122 +109,128 @@ def test_rolling_consistency_var_std_cov( @pytest.mark.parametrize("ddof", [0, 1]) def test_rolling_consistency_series_cov_corr( - consistency_data, rolling_consistency_cases, center, ddof + series_data, rolling_consistency_cases, center, ddof ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if isinstance(x, Series): - var_x_plus_y = ( - (x + x) - .rolling(window=window, min_periods=min_periods, center=center) - .var(ddof=ddof) - ) - var_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) - var_y = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=ddof - ) - cov_x_y = x.rolling(window=window, min_periods=min_periods, center=center).cov( - x, ddof=ddof - ) - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + var_x_plus_y = ( + (series_data + series_data) + .rolling(window=window, min_periods=min_periods, center=center) + .var(ddof=ddof) + ) + var_x = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) + var_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=ddof) + cov_x_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).cov(series_data, ddof=ddof) + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + tm.assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + corr_x_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).corr(series_data) + std_x = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=ddof) + std_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=ddof) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - corr_x_y = x.rolling( + if ddof == 0: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_x = series_data.rolling( window=window, min_periods=min_periods, center=center - ).corr(x) - std_x = x.rolling(window=window, min_periods=min_periods, center=center).std( - ddof=ddof - ) - std_y = x.rolling(window=window, min_periods=min_periods, center=center).std( - ddof=ddof + ).mean() + mean_y = series_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + mean_x_times_y = ( + (series_data * series_data) + .rolling(window=window, min_periods=min_periods, center=center) + .mean() ) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if ddof == 0: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).mean() - mean_y = x.rolling( - window=window, min_periods=min_periods, center=center - ).mean() - mean_x_times_y = ( - (x * x) - .rolling(window=window, min_periods=min_periods, center=center) - .mean() - ) - tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * 
mean_y)) + tm.assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) -def test_rolling_consistency_mean(consistency_data, rolling_consistency_cases, center): - x, is_constant, no_nans = consistency_data +def test_rolling_consistency_mean(all_data, rolling_consistency_cases, center): window, min_periods = rolling_consistency_cases - result = x.rolling(window=window, min_periods=min_periods, center=center).mean() + result = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() expected = ( - x.rolling(window=window, min_periods=min_periods, center=center) + all_data.rolling(window=window, min_periods=min_periods, center=center) .sum() .divide( - x.rolling(window=window, min_periods=min_periods, center=center).count() + all_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() ) ) tm.assert_equal(result, expected.astype("float64")) def test_rolling_consistency_constant( - consistency_data, rolling_consistency_cases, center + consistent_data, rolling_consistency_cases, center ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases - if is_constant: - count_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).count() - mean_x = x.rolling(window=window, min_periods=min_periods, center=center).mean() - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = x.rolling( - window=window, min_periods=min_periods, center=center - ).corr(x) + count_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).count() + mean_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = consistent_data.rolling( + window=window, min_periods=min_periods, center=center + ).corr(consistent_data) - exp = x.max() if isinstance(x, Series) else x.max().max() + exp = ( + consistent_data.max() + if isinstance(consistent_data, Series) + else consistent_data.max().max() + ) - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) + # check mean of constant series + expected = consistent_data * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) def test_rolling_consistency_var_debiasing_factors( - consistency_data, rolling_consistency_cases, center + all_data, rolling_consistency_cases, center ): - x, is_constant, no_nans = consistency_data window, min_periods = rolling_consistency_cases # check variance debiasing factors - var_unbiased_x = x.rolling( + var_unbiased_x = all_data.rolling( window=window, min_periods=min_periods, center=center ).var() - var_biased_x = x.rolling(window=window, min_periods=min_periods, center=center).var( - ddof=0 - ) + var_biased_x = all_data.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) var_debiasing_factors_x = ( - x.rolling(window=window, min_periods=min_periods, center=center) + all_data.rolling(window=window, min_periods=min_periods, center=center) .count() .divide( ( - x.rolling(window=window, min_periods=min_periods, center=center).count() + all_data.rolling( + window=window, 
min_periods=min_periods, center=center + ).count() - 1.0 ).replace(0.0, np.nan) )
From d00915b416871a54134b15ee6fe81de3f37f0d86 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Mon, 20 Dec 2021 13:09:33 +0530 Subject: [PATCH 18/19] Moved testcase below --- pandas/tests/series/methods/test_replace.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index b08e6714fe90f..c0234ee2649b5 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py
@@ -518,14 +518,6 @@ def test_pandas_replace_na(self): exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_series(self, regex): - # GH-48644 - series = pd.Series(["0"]) - expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=regex) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "dtype, input_data, to_replace, expected_data", [
@@ -612,3 +604,11 @@ def test_replace_nullable_numeric(self): assert ints.replace({1: 9.0}).dtype == ints.dtype assert ints.replace(1, 9.0).dtype == ints.dtype # FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element + + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series(self, regex): + # GH-48644 + series = pd.Series(["0"]) + expected = pd.Series([1]) + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected)
From 0d93bddadaa605e8ee78de1107903b50e925c4a4 Mon Sep 17 00:00:00 2001 From: shubham11941140 Date: Wed, 22 Dec 2021 10:37:49 +0530 Subject: [PATCH 19/19] Changed Whatsnew note --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 11ea456b8799c..46c1d16410fd8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst
@@ -829,7 +829,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with ``ExtensionDtype`` columns incorrectly raising (:issue:`43561`) - Bug in :meth:`Series.unstack` with object doing unwanted type inference on resulting columns (:issue:`44595`) - Bug in :class:`MultiIndex` failing join operations with overlapping ``IntervalIndex`` levels (:issue:`44096`) -- Bug in :func:`replace` results is different ``dtype`` based on ``regex`` parameter (:issue:`44864`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` returning a different ``dtype`` depending on the ``regex`` parameter (:issue:`44864`)
Sparse ^^^^^^