From e37242467bba9a04f2dcb23f59c4c9c4d31a13ae Mon Sep 17 00:00:00 2001 From: Abhishek Date: Wed, 30 Oct 2024 21:40:39 +0530 Subject: [PATCH 1/8] bug fix for numpy.uintc in merge operations on windows Added pytest test case to verify correct behavior with numpy.uintc dtype --- pandas/core/reshape/merge.py | 12 +++++++++++- pandas/tests/reshape/merge/test_merge.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 07e8fa4841c04..0ca8661ad3b5c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -123,7 +123,17 @@ # See https://github.com/pandas-dev/pandas/issues/52451 if np.intc is not np.int32: - _factorizers[np.intc] = libhashtable.Int64Factorizer + if np.dtype(np.intc).itemsize == 4: + _factorizers[np.intc] = libhashtable.Int32Factorizer + else: + _factorizers[np.intc] = libhashtable.Int64Factorizer + +if np.uintc is not np.uint32: + if np.dtype(np.uintc).itemsize == 4: + _factorizers[np.uintc] = libhashtable.UInt32Factorizer + else: + _factorizers[np.uintc] = libhashtable.UInt64Factorizer + _known = (np.ndarray, ExtensionArray, Index, ABCSeries) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d4766242b8460..cb71445671299 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1842,6 +1842,19 @@ def test_merge_empty(self, left_empty, how, exp): expected = expected.sort_values("A", ignore_index=True) tm.assert_frame_equal(result, expected) + + def test_merge_with_uintc_columns(dataframes_with_uintc): + """To test if pd.merge works with numpy.uintc on windows""" + + df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': np.array([1, 2], dtype=np.uintc)}) + df2 = pd.DataFrame({'a': ['foo', 'baz'], 'b': np.array([3, 4], dtype=np.uintc)}) + result = df1.merge(df2, how='outer') + expected = pd.DataFrame({ + 'a': ['bar', 'baz', 'foo','foo'], + 'b': np.array([2,4,1,3],dtype=np.uintc) + }) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + @pytest.fixture From 9b023e67447384eb34fd617c80790d3ac1b39f66 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Wed, 30 Oct 2024 22:19:16 +0530 Subject: [PATCH 2/8] Formatting changes after running pre-commit --- pandas/tests/reshape/merge/test_merge.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index cb71445671299..ceb6c2deecd61 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1842,19 +1842,20 @@ def test_merge_empty(self, left_empty, how, exp): expected = expected.sort_values("A", ignore_index=True) tm.assert_frame_equal(result, expected) - + def test_merge_with_uintc_columns(dataframes_with_uintc): """To test if pd.merge works with numpy.uintc on windows""" - - df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': np.array([1, 2], dtype=np.uintc)}) - df2 = pd.DataFrame({'a': ['foo', 'baz'], 'b': np.array([3, 4], dtype=np.uintc)}) - result = df1.merge(df2, how='outer') - expected = pd.DataFrame({ - 'a': ['bar', 'baz', 'foo','foo'], - 'b': np.array([2,4,1,3],dtype=np.uintc) - }) + + df1 = pd.DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) + df2 = pd.DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) + result = df1.merge(df2, how="outer") + expected = pd.DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.uintc), + } + ) tm.assert_frame_equal(result.reset_index(drop=True), expected) - @pytest.fixture From 9a3a01963e04333737f1b7333d7474fbdcec060a Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 31 Oct 2024 10:33:49 +0530 Subject: [PATCH 3/8] Added tests for numpy.intc --- pandas/tests/reshape/merge/test_merge.py | 30 +++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ceb6c2deecd61..7d05e271a898f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1843,9 +1843,7 @@ def test_merge_empty(self, left_empty, how, exp): tm.assert_frame_equal(result, expected) - def test_merge_with_uintc_columns(dataframes_with_uintc): - """To test if pd.merge works with numpy.uintc on windows""" - + def test_merge_with_uintc_columns(self): df1 = pd.DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) df2 = pd.DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) result = df1.merge(df2, how="outer") @@ -1856,6 +1854,32 @@ def test_merge_with_uintc_columns(dataframes_with_uintc): } ) tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_with_intc_columns(self): + df1 = pd.DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)}) + df2 = pd.DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)}) + result = df1.merge(df2, how="outer") + expected = pd.DataFrame( + { + "a": ["bar", "baz", "foo", "foo"], + "b": np.array([2, 4, 1, 3], dtype=np.intc), + } + ) + tm.assert_frame_equal(result.reset_index(drop=True), expected) + + def test_merge_intc_non_monotonic(self): + df = pd.DataFrame({"join_key": pd.Series([0, 2, 1], dtype=np.intc)}) + df_details = pd.DataFrame({"join_key": pd.Series([0, 1, 2], dtype=np.intc),"value": ["a", "b", "c"]}) + merged = pd.merge(df, df_details, on="join_key", how="left") + expected = pd.DataFrame( + { + 'join_key':np.array([0,2,1],dtype=np.intc), + 'value':['a','c','b'] + } + ) + tm.assert_frame_equal(merged.reset_index(drop=True),expected) + + @pytest.fixture From d61de2e7f1b0c0e3cbae1ad3a9aca22b1fca7326 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 31 Oct 2024 11:05:29 +0530 Subject: [PATCH 4/8] added whatsnew note --- doc/source/whatsnew/v3.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 87d92f6618023..7af2cfff78ab7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -592,7 +592,6 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) -- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) @@ -739,6 +738,8 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) +- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.uintc resulting in KeyError: ,Only on windows (:issue:`58713`) +- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.intc resulting in ValueError: Buffer dtype mismatch, Only on windows (:issue:`60091`) Sparse ^^^^^^ From db7faa6cd4c4fb11c385ca313bb24a00b0854481 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 31 Oct 2024 11:23:13 +0530 Subject: [PATCH 5/8] pre-commit automatic changes and also made changes to test_merge.py file to make pandas namespace consistent --- doc/source/whatsnew/v3.0.0.rst | 4 +-- pandas/tests/reshape/merge/test_merge.py | 36 +++++++++++------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7af2cfff78ab7..d0e3fc923e574 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -736,10 +736,10 @@ Reshaping - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) +- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.intc resulting in ValueError: Buffer dtype mismatch, Only on windows (:issue:`60091`) +- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.uintc resulting in KeyError: ,Only on windows (:issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) -- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.uintc resulting in KeyError: ,Only on windows (:issue:`58713`) -- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.intc resulting in ValueError: Buffer dtype mismatch, Only on windows (:issue:`60091`) Sparse ^^^^^^ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7d05e271a898f..82107e979d744 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1844,42 +1844,40 @@ def test_merge_empty(self, left_empty, how, exp): tm.assert_frame_equal(result, expected) def test_merge_with_uintc_columns(self): - df1 = pd.DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) - df2 = pd.DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)}) result = df1.merge(df2, how="outer") - expected = pd.DataFrame( + expected = DataFrame( { "a": ["bar", "baz", "foo", "foo"], "b": np.array([2, 4, 1, 3], dtype=np.uintc), } ) tm.assert_frame_equal(result.reset_index(drop=True), expected) - + def test_merge_with_intc_columns(self): - df1 = pd.DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)}) - df2 = pd.DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)}) + df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)}) + df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)}) result = df1.merge(df2, how="outer") - expected = pd.DataFrame( + expected = DataFrame( { "a": ["bar", "baz", "foo", "foo"], "b": np.array([2, 4, 1, 3], dtype=np.intc), } ) tm.assert_frame_equal(result.reset_index(drop=True), expected) - + def test_merge_intc_non_monotonic(self): - df = pd.DataFrame({"join_key": pd.Series([0, 2, 1], dtype=np.intc)}) - df_details = pd.DataFrame({"join_key": pd.Series([0, 1, 2], dtype=np.intc),"value": ["a", "b", "c"]}) - merged = pd.merge(df, df_details, on="join_key", how="left") - expected = pd.DataFrame( - { - 'join_key':np.array([0,2,1],dtype=np.intc), - 'value':['a','c','b'] - } + df = DataFrame({"join_key": Series([0, 2, 1], dtype=np.intc)}) + df_details = DataFrame( + {"join_key": Series([0, 1, 2], dtype=np.intc), "value": ["a", "b", "c"]} ) - tm.assert_frame_equal(merged.reset_index(drop=True),expected) - - + # merged = pd.merge(df, df_details, on="join_key", how="left") + merged = df.merge(df_details, on="join_key", how="left") + expected = DataFrame( + {"join_key": np.array([0, 2, 1], dtype=np.intc), "value": ["a", "c", "b"]} + ) + tm.assert_frame_equal(merged.reset_index(drop=True), expected) @pytest.fixture From aea3e8079c5d5e9f363124ddd24b3f12022bead1 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Thu, 31 Oct 2024 11:39:12 +0530 Subject: [PATCH 6/8] removed comment --- pandas/tests/reshape/merge/test_merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 82107e979d744..f0abc1afc6ab0 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1872,7 +1872,6 @@ def test_merge_intc_non_monotonic(self): df_details = DataFrame( {"join_key": Series([0, 1, 2], dtype=np.intc), "value": ["a", "b", "c"]} ) - # merged = pd.merge(df, df_details, on="join_key", how="left") merged = df.merge(df_details, on="join_key", how="left") expected = DataFrame( {"join_key": np.array([0, 2, 1], dtype=np.intc), "value": ["a", "c", "b"]} From dcf77c833fb8ca2d7375511316496a15cd6437f0 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Fri, 1 Nov 2024 11:56:39 +0530 Subject: [PATCH 7/8] added the deleted whatsnew note back --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 377273c60a18f..c0fb33b6d5439 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -593,6 +593,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) +- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) From 419dc6b5c3dc01bbb47f605509f5864bf0f15a59 Mon Sep 17 00:00:00 2001 From: Abhishek Chaudhari <91185083+AbhishekChaudharii@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:00:45 +0530 Subject: [PATCH 8/8] better whatsnew note Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c0fb33b6d5439..2e64c66812306 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -739,8 +739,7 @@ Reshaping - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) -- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.intc resulting in ValueError: Buffer dtype mismatch, Only on windows (:issue:`60091`) -- Bug in :meth:`DataFrame.merge` when merging two dataframes with column dtype as numpy.uintc resulting in KeyError: ,Only on windows (:issue:`58713`) +- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`) - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)