From 9c613522dcef4dc42cc4f7bf3ab6f214bf4ca9a2 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 19 May 2020 19:19:09 -0300 Subject: [PATCH 01/10] solving issue #34125, now the isin function is more consistent when comparing an int df with a string value in an isin comparison --- pandas/core/algorithms.py | 9 ++++++++- pandas/tests/frame/methods/test_isin.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d270a6431be56..ad2b95fd7cd8e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -428,8 +428,14 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # TODO(extension) # handle categoricals return comps.isin(values) # type: ignore + try: + if comps.dtype.name == 'int64': + comps, dtype = _ensure_data(comps, dtype=object) + else: + comps, dtype = _ensure_data(comps) + except: + comps, dtype = _ensure_data(comps) - comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d @@ -2084,3 +2090,4 @@ def sort_mixed(values): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) + diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 6307738021f68..cd2db3497d6a3 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -189,3 +189,17 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) + + def test_isin_int_df_string_search(self): + #Comparing df with int`s with a string at isin() -> should not match values + df = pd.DataFrame({"values": [1,2]}) + result = df.isin(["1"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + #Comparing df with int`s with a int at isin() -> should find the value + result = 
df.isin([1]) + expected_true = pd.DataFrame({"values": [True, False]}) + tm.assert_frame_equal(result, expected_true) + + From e0ec44a46b13c5772ffb7fe227155caf0000d158 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 19 May 2020 19:22:32 -0300 Subject: [PATCH 02/10] adding a line that i forgot --- pandas/tests/frame/methods/test_isin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index cd2db3497d6a3..3bf02212cc5bf 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -189,15 +189,15 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) - + def test_isin_int_df_string_search(self): - #Comparing df with int`s with a string at isin() -> should not match values + #Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1 df = pd.DataFrame({"values": [1,2]}) result = df.isin(["1"]) expected_false = pd.DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) - #Comparing df with int`s with a int at isin() -> should find the value + #Comparing df with int`s with a int at isin() -> should be fine result = df.isin([1]) expected_true = pd.DataFrame({"values": [True, False]}) tm.assert_frame_equal(result, expected_true) From 80752adfc0245acd556ff66c07fcc0854abe32ad Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 19 May 2020 19:37:33 -0300 Subject: [PATCH 03/10] fixing some formatting things --- pandas/core/algorithms.py | 3 +-- pandas/tests/frame/methods/test_isin.py | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ad2b95fd7cd8e..c268ad40f2e0e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -429,7 +429,7 @@ def 
isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # handle categoricals return comps.isin(values) # type: ignore try: - if comps.dtype.name == 'int64': + if comps.dtype.name == "int64": comps, dtype = _ensure_data(comps, dtype=object) else: comps, dtype = _ensure_data(comps) @@ -2090,4 +2090,3 @@ def sort_mixed(values): np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) - diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 3bf02212cc5bf..529119d3e6533 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -191,15 +191,13 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) def test_isin_int_df_string_search(self): - #Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1 - df = pd.DataFrame({"values": [1,2]}) + # Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1 + df = DataFrame({"values": [1, 2]}) result = df.isin(["1"]) expected_false = pd.DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) - #Comparing df with int`s with a int at isin() -> should be fine + # Comparing df with int`s with a int at isin() -> should be fine result = df.isin([1]) - expected_true = pd.DataFrame({"values": [True, False]}) + expected_true = DataFrame({"values": [True, False]}) tm.assert_frame_equal(result, expected_true) - - From 60000cac1ec3d9f91b86a5ac1d337219eaf805a6 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 19 May 2020 20:28:11 -0300 Subject: [PATCH 04/10] removing try and except for a cleaner look --- pandas/core/algorithms.py | 5 +++-- pandas/tests/frame/methods/test_isin.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c268ad40f2e0e..feee8e600d2f0 
100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -428,12 +428,13 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # TODO(extension) # handle categoricals return comps.isin(values) # type: ignore - try: + + if hasattr(comps, "dtype") and hasattr(comps.dtype, "name"): if comps.dtype.name == "int64": comps, dtype = _ensure_data(comps, dtype=object) else: comps, dtype = _ensure_data(comps) - except: + else: comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 529119d3e6533..e5180065c0c10 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -191,7 +191,8 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) def test_isin_int_df_string_search(self): - # Comparing df with int`s (1,2) with a string at isin() ("1") -> should not match values because int 1 is not equal str 1 + """Comparing df with int`s (1,2) with a string at isin() ("1") + -> should not match values because int 1 is not equal str 1""" df = DataFrame({"values": [1, 2]}) result = df.isin(["1"]) expected_false = pd.DataFrame({"values": [False, False]}) From 47e5962f72b597291178dcfb2680806dcc9fb4e7 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Sun, 31 May 2020 20:58:39 -0300 Subject: [PATCH 05/10] Update algorithms.py --- pandas/core/algorithms.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index feee8e600d2f0..d270a6431be56 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -429,14 +429,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # handle categoricals return comps.isin(values) # type: ignore - if hasattr(comps, "dtype") and hasattr(comps.dtype, "name"): - if comps.dtype.name == "int64": - comps, dtype = _ensure_data(comps, 
dtype=object) - else: - comps, dtype = _ensure_data(comps) - else: - comps, dtype = _ensure_data(comps) - + comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d From 27333365324160d0189952307eaec7168677c154 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Mon, 1 Jun 2020 21:10:39 -0300 Subject: [PATCH 06/10] Some xfailed tests added to identify the bad behavior of the isin func, related to issue 34125 --- pandas/tests/frame/methods/test_isin.py | 13 ------------ pandas/tests/test_algos.py | 27 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index e5180065c0c10..6307738021f68 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -189,16 +189,3 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) - - def test_isin_int_df_string_search(self): - """Comparing df with int`s (1,2) with a string at isin() ("1") - -> should not match values because int 1 is not equal str 1""" - df = DataFrame({"values": [1, 2]}) - result = df.isin(["1"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - # Comparing df with int`s with a int at isin() -> should be fine - result = df.isin([1]) - expected_true = DataFrame({"values": [True, False]}) - tm.assert_frame_equal(result, expected_true) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6008b4f9f4e33..feafc45e68761 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2374,3 +2374,30 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_int_df_string_search(self): + 
"""Comparing df with int`s (1,2) with a string at isin() ("1") + -> should not match values because int 1 is not equal str 1""" + df = pd.DataFrame({"values": [1, 2]}) + result = df.isin(["1"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_nan_df_string_search(self): + """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") + -> should not match values because np.nan is not equal str NaN """ + df = DataFrame({"values": [np.nan, 2]}) + result = df.isin(["NaN"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_float_df_string_search(self): + """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") + -> should not match values because float 1.4245 is not equal str 1.4245""" + df = DataFrame({"values": [1.4245, 2.32441]}) + result = df.isin(["1.4245"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) From d90af92fe8ea5829869ed372a83fb0d4c1937009 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Mon, 1 Jun 2020 22:26:02 -0300 Subject: [PATCH 07/10] Update test_algos.py --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index feafc45e68761..f3f256afc1344 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2388,7 +2388,7 @@ def test_isin_int_df_string_search(self): def test_isin_nan_df_string_search(self): """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") -> should not match values because np.nan is not equal str NaN """ - df = DataFrame({"values": [np.nan, 2]}) + df = pd.DataFrame({"values": [np.nan, 2]}) result = df.isin(["NaN"]) 
expected_false = pd.DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) @@ -2397,7 +2397,7 @@ def test_isin_nan_df_string_search(self): def test_isin_float_df_string_search(self): """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") -> should not match values because float 1.4245 is not equal str 1.4245""" - df = DataFrame({"values": [1.4245, 2.32441]}) + df = pd.DataFrame({"values": [1.4245, 2.32441]}) result = df.isin(["1.4245"]) expected_false = pd.DataFrame({"values": [False, False]}) tm.assert_frame_equal(result, expected_false) From e3adf5dc6be80259bf5dc7430d77baac3531d3f2 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 2 Jun 2020 16:46:10 -0300 Subject: [PATCH 08/10] Update test_algos.py --- pandas/tests/test_algos.py | 54 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f3f256afc1344..ff5f890cc41f8 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -944,6 +944,33 @@ def test_different_nans_as_float64(self): expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_int_df_string_search(self): + """Comparing df with int`s (1,2) with a string at isin() ("1") + -> should not match values because int 1 is not equal str 1""" + df = pd.DataFrame({"values": [1, 2]}) + result = df.isin(["1"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_nan_df_string_search(self): + """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") + -> should not match values because np.nan is not equal str NaN """ + df = pd.DataFrame({"values": [np.nan, 2]}) + result = df.isin(["NaN"]) + expected_false = pd.DataFrame({"values": 
[False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_float_df_string_search(self): + """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") + -> should not match values because float 1.4245 is not equal str 1.4245""" + df = pd.DataFrame({"values": [1.4245, 2.32441]}) + result = df.isin(["1.4245"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + class TestValueCounts: def test_value_counts(self): @@ -2374,30 +2401,3 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) - - @pytest.mark.xfail(reason="problem related with issue #34125") - def test_isin_int_df_string_search(self): - """Comparing df with int`s (1,2) with a string at isin() ("1") - -> should not match values because int 1 is not equal str 1""" - df = pd.DataFrame({"values": [1, 2]}) - result = df.isin(["1"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - @pytest.mark.xfail(reason="problem related with issue #34125") - def test_isin_nan_df_string_search(self): - """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") - -> should not match values because np.nan is not equal str NaN """ - df = pd.DataFrame({"values": [np.nan, 2]}) - result = df.isin(["NaN"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - @pytest.mark.xfail(reason="problem related with issue #34125") - def test_isin_float_df_string_search(self): - """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") - -> should not match values because float 1.4245 is not equal str 1.4245""" - df = pd.DataFrame({"values": [1.4245, 2.32441]}) - result = df.isin(["1.4245"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, 
expected_false) From 2771e3799623e459548feb73ca7dbe22d2820a21 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Mon, 1 Jun 2020 21:10:39 -0300 Subject: [PATCH 09/10] Tests added to the right part of the code (class TestIsin) --- pandas/tests/frame/methods/test_isin.py | 13 ------------ pandas/tests/test_algos.py | 27 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index e5180065c0c10..6307738021f68 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -189,16 +189,3 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) - - def test_isin_int_df_string_search(self): - """Comparing df with int`s (1,2) with a string at isin() ("1") - -> should not match values because int 1 is not equal str 1""" - df = DataFrame({"values": [1, 2]}) - result = df.isin(["1"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - # Comparing df with int`s with a int at isin() -> should be fine - result = df.isin([1]) - expected_true = DataFrame({"values": [True, False]}) - tm.assert_frame_equal(result, expected_true) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6008b4f9f4e33..feafc45e68761 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2374,3 +2374,30 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_int_df_string_search(self): + """Comparing df with int`s (1,2) with a string at isin() ("1") + -> should not match values because int 1 is not equal str 1""" + df = pd.DataFrame({"values": [1, 2]}) + result = df.isin(["1"]) + expected_false = pd.DataFrame({"values": [False, False]}) + 
tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_nan_df_string_search(self): + """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") + -> should not match values because np.nan is not equal str NaN """ + df = DataFrame({"values": [np.nan, 2]}) + result = df.isin(["NaN"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_float_df_string_search(self): + """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") + -> should not match values because float 1.4245 is not equal str 1.4245""" + df = DataFrame({"values": [1.4245, 2.32441]}) + result = df.isin(["1.4245"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) From 478c099048b81886fdd609ddd31355d537ade18a Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Tue, 2 Jun 2020 17:19:30 -0300 Subject: [PATCH 10/10] Update test_algos.py Some fix just that i forgot to delete some code --- pandas/tests/test_algos.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fc4299e9b6c7f..ff5f890cc41f8 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2401,30 +2401,3 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) - - @pytest.mark.xfail(reason="problem related with issue #34125") - def test_isin_int_df_string_search(self): - """Comparing df with int`s (1,2) with a string at isin() ("1") - -> should not match values because int 1 is not equal str 1""" - df = pd.DataFrame({"values": [1, 2]}) - result = df.isin(["1"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - @pytest.mark.xfail(reason="problem 
related with issue #34125") - def test_isin_nan_df_string_search(self): - """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") - -> should not match values because np.nan is not equal str NaN """ - df = DataFrame({"values": [np.nan, 2]}) - result = df.isin(["NaN"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false) - - @pytest.mark.xfail(reason="problem related with issue #34125") - def test_isin_float_df_string_search(self): - """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") - -> should not match values because float 1.4245 is not equal str 1.4245""" - df = DataFrame({"values": [1.4245, 2.32441]}) - result = df.isin(["1.4245"]) - expected_false = pd.DataFrame({"values": [False, False]}) - tm.assert_frame_equal(result, expected_false)