From dee7a47a90426975d975cce189bdeb11de488091 Mon Sep 17 00:00:00 2001 From: mohammed benyamna Date: Fri, 2 May 2025 12:16:38 +0100 Subject: [PATCH 1/3] MultiIndex.difference not working with PyArrow timestamps(#61382) --- pandas/core/indexes/multi.py | 54 +++++++++++++++ pandas/tests/frame/test_query_eval.py | 22 ++---- pandas/tests/indexes/multi/test_setops.py | 33 +++++++++ ...check_for_inconsistent_pandas_namespace.py | 3 +- scripts/check_test_naming.py | 1 + scripts/generate_pip_deps_from_conda.py | 1 + scripts/pandas_errors_documented.py | 1 + scripts/sort_whatsnew_note.py | 1 + scripts/tests/test_check_test_naming.py | 5 +- .../test_inconsistent_namespace_check.py | 8 +-- scripts/tests/test_validate_docstrings.py | 20 +++--- scripts/validate_docstrings.py | 69 ++++++++++--------- scripts/validate_exception_location.py | 1 + scripts/validate_min_versions_in_sync.py | 3 +- scripts/validate_rst_title_capitalization.py | 4 +- scripts/validate_unwanted_patterns.py | 44 +++++------- 16 files changed, 170 insertions(+), 100 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 29b34f560ab2e..05779c9c8a13a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3891,6 +3891,60 @@ def equal_levels(self, other: MultiIndex) -> bool: # -------------------------------------------------------------------- # Set Methods + def difference(self, other, sort=None): + """ + Return a new MultiIndex with elements from the index not in `other`. + + Parameters + ---------- + other : MultiIndex or array-like + sort : bool or None, default None + Whether to sort the resulting index. + + Returns + ------- + MultiIndex + """ + if not isinstance(other, MultiIndex): + other = MultiIndex.from_tuples(other, names=self.names) + + # Convert 'other' to codes using self's levels + other_codes = [] + for i, (lev, name) in enumerate(zip(self.levels, self.names)): + level_vals = other.get_level_values(i) + other_code = lev.get_indexer(level_vals) + other_codes.append(other_code) + + # Create mask for elements not in 'other' + n = len(self) + mask = np.ones(n, dtype=bool) + engine = self._engine + for codes in zip(*other_codes): + try: + loc = engine.get_loc(tuple(codes)) + if isinstance(loc, slice): + mask[loc] = False + elif isinstance(loc, np.ndarray): + mask &= ~loc + else: + mask[loc] = False + except KeyError: + pass + + new_codes = [code[mask] for code in self.codes] + result = MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) + if sort is None or sort is True: + try: + return result.sort_values() + except TypeError: + pass + return result + def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) if other.has_duplicates: diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index f93105498ac79..a565e9b214903 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -160,21 +160,13 @@ def test_query_empty_string(self): df.query("") def test_query_duplicate_column_name(self, engine, parser): - df = DataFrame( - { - "A": range(3), - "B": range(3), - "C": range(3) - } - ).rename(columns={"B": "A"}) + df = DataFrame({"A": range(3), "B": range(3), "C": range(3)}).rename( + columns={"B": "A"} + ) - res = df.query('C == 1', engine=engine, parser=parser) + res = df.query("C == 1", engine=engine, parser=parser) - expect = DataFrame( - [[1, 1, 1]], - columns=["A", "A", "C"], - index=[1] - ) + expect = DataFrame([[1, 1, 1]], columns=["A", "A", "C"], index=[1]) tm.assert_frame_equal(res, expect) @@ -1140,9 +1132,7 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings( - self, parser, engine, op, func - ): + def test_query_lex_compare_strings(self, parser, engine, op, func): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index f7544cf62e5fa..ad9d849fc7c79 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -195,6 +195,39 @@ def test_difference(idx, sort): first.difference([1, 2, 3, 4, 5], sort=sort) +def test_multiindex_difference_pyarrow_timestamp(): + pa = pytest.importorskip("pyarrow") + + df = ( + DataFrame( + [(1, "1900-01-01", "a"), (2, "1900-01-01", "b")], + columns=["id", "date", "val"], + ) + .astype( + { + "id": "int64[pyarrow]", + "date": "timestamp[ns][pyarrow]", + "val": "string[pyarrow]", + } + ) + .set_index(["id", "date"]) + ) + + idx = df.index + idx_val = idx[0] + + # Assert the value exists in the original index + assert idx_val in idx + + # Remove idx_val using difference() + new_idx = idx.difference([idx_val]) + + # Verify the result + assert len(new_idx) == 1 + assert idx_val not in new_idx + assert new_idx.equals(MultiIndex.from_tuples([(2, pd.Timestamp("1900-01-01"))])) + + def test_difference_sort_special(): # GH-24959 idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py index ec0a4a408c800..39e5fd2955e0a 100644 --- a/scripts/check_for_inconsistent_pandas_namespace.py +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -30,8 +30,7 @@ from typing import NamedTuple ERROR_MESSAGE = ( - "{path}:{lineno}:{col_offset}: " - "Found both '{prefix}.{name}' and '{name}' in {path}" + "{path}:{lineno}:{col_offset}: Found both '{prefix}.{name}' and '{name}' in {path}" ) diff --git a/scripts/check_test_naming.py b/scripts/check_test_naming.py index f9190643b3246..629687a866508 100644 --- a/scripts/check_test_naming.py +++ b/scripts/check_test_naming.py @@ -8,6 +8,7 @@ NOTE: if this finds a false positive, you can add the comment `# not a test` to the class or function definition. Though hopefully that shouldn't be necessary. """ + from __future__ import annotations import argparse diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index a57876902ad36..4b416a2b32319 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -12,6 +12,7 @@ generated with this script: $ python scripts/generate_pip_deps_from_conda.py --compare """ + import argparse import pathlib import re diff --git a/scripts/pandas_errors_documented.py b/scripts/pandas_errors_documented.py index b68da137717de..a4716b70e5f26 100644 --- a/scripts/pandas_errors_documented.py +++ b/scripts/pandas_errors_documented.py @@ -6,6 +6,7 @@ pre-commit run pandas-errors-documented --all-files """ + from __future__ import annotations import argparse diff --git a/scripts/sort_whatsnew_note.py b/scripts/sort_whatsnew_note.py index 428ffca83ea26..3e23d88ef92d3 100644 --- a/scripts/sort_whatsnew_note.py +++ b/scripts/sort_whatsnew_note.py @@ -23,6 +23,7 @@ pre-commit run sort-whatsnew-items --all-files """ + from __future__ import annotations import argparse diff --git a/scripts/tests/test_check_test_naming.py b/scripts/tests/test_check_test_naming.py index dbd803ce4dd31..02c31ddef2ba2 100644 --- a/scripts/tests/test_check_test_naming.py +++ b/scripts/tests/test_check_test_naming.py @@ -24,10 +24,7 @@ 0, ), ( - "class Foo: # not a test\n" - " pass\n" - "def test_foo():\n" - " Class.foo()\n", + "class Foo: # not a test\n pass\ndef test_foo():\n Class.foo()\n", "", 0, ), diff --git a/scripts/tests/test_inconsistent_namespace_check.py b/scripts/tests/test_inconsistent_namespace_check.py index 64f66e6168efe..73893a3c86dac 100644 --- a/scripts/tests/test_inconsistent_namespace_check.py +++ b/scripts/tests/test_inconsistent_namespace_check.py @@ -5,14 +5,10 @@ ) BAD_FILE_0 = ( - "from pandas import Categorical\n" - "cat_0 = Categorical()\n" - "cat_1 = pd.Categorical()" + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = pd.Categorical()" ) BAD_FILE_1 = ( - "from pandas import Categorical\n" - "cat_0 = pd.Categorical()\n" - "cat_1 = Categorical()" + "from pandas import Categorical\ncat_0 = pd.Categorical()\ncat_1 = Categorical()" ) BAD_FILE_2 = ( "from pandas import Categorical\n" diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 3bffd1f1987aa..381baa1f666f1 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -34,8 +34,7 @@ def redundant_import(self, paramx=None, paramy=None) -> None: -------- >>> import numpy as np >>> import pandas as pd - >>> df = pd.DataFrame(np.ones((3, 3)), - ... columns=('a', 'b', 'c')) + >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c")) >>> df.all(axis=1) 0 True 1 True @@ -50,14 +49,14 @@ def unused_import(self) -> None: Examples -------- >>> import pandas as pdf - >>> df = pd.DataFrame(np.ones((3, 3)), columns=('a', 'b', 'c')) + >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c")) """ def missing_whitespace_around_arithmetic_operator(self) -> None: """ Examples -------- - >>> 2+5 + >>> 2 + 5 7 """ @@ -66,14 +65,14 @@ def indentation_is_not_a_multiple_of_four(self) -> None: Examples -------- >>> if 2 + 5: - ... pass + ... pass """ def missing_whitespace_after_comma(self) -> None: """ Examples -------- - >>> df = pd.DataFrame(np.ones((3,3)),columns=('a','b', 'c')) + >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c")) """ def write_array_like_with_hyphen_not_underscore(self) -> None: @@ -227,13 +226,13 @@ def test_validate_all_ignore_errors(self, monkeypatch): "errors": [ ("ER01", "err desc"), ("ER02", "err desc"), - ("ER03", "err desc") + ("ER03", "err desc"), ], "warnings": [], "examples_errors": "", "deprecated": True, "file": "file1", - "file_line": "file_line1" + "file_line": "file_line1", }, ) monkeypatch.setattr( @@ -272,14 +271,13 @@ def test_validate_all_ignore_errors(self, monkeypatch): None: {"ER03"}, "pandas.DataFrame.align": {"ER01"}, # ignoring an error that is not requested should be of no effect - "pandas.Index.all": {"ER03"} - } + "pandas.Index.all": {"ER03"}, + }, ) # two functions * two not global ignored errors - one function ignored error assert exit_status == 2 * 2 - 1 - class TestApiItems: @property def api_doc(self): diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 944575dcc8659..c878820d75487 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -13,6 +13,7 @@ $ ./validate_docstrings.py $ ./validate_docstrings.py pandas.DataFrame.head """ + from __future__ import annotations import argparse @@ -69,8 +70,10 @@ } ALL_ERRORS = set(NUMPYDOC_ERROR_MSGS).union(set(ERROR_MSGS)) duplicated_errors = set(NUMPYDOC_ERROR_MSGS).intersection(set(ERROR_MSGS)) -assert not duplicated_errors, (f"Errors {duplicated_errors} exist in both pandas " - "and numpydoc, should they be removed from pandas?") +assert not duplicated_errors, ( + f"Errors {duplicated_errors} exist in both pandas " + "and numpydoc, should they be removed from pandas?" +) def pandas_error(code, **kwargs): @@ -257,7 +260,7 @@ def pandas_validate(func_name: str): pandas_error( "SA05", reference_name=rel_name, - right_reference=rel_name[len("pandas."):], + right_reference=rel_name[len("pandas.") :], ) for rel_name in doc.see_also if rel_name.startswith("pandas.") @@ -365,17 +368,18 @@ def print_validate_all_results( for func_name, res in result.items(): error_messages = dict(res["errors"]) actual_failures = set(error_messages) - expected_failures = (ignore_errors.get(func_name, set()) - | ignore_errors.get(None, set())) + expected_failures = ignore_errors.get(func_name, set()) | ignore_errors.get( + None, set() + ) for err_code in actual_failures - expected_failures: sys.stdout.write( - f'{prefix}{res["file"]}:{res["file_line"]}:' - f'{err_code}:{func_name}:{error_messages[err_code]}\n' + f"{prefix}{res['file']}:{res['file_line']}:" + f"{err_code}:{func_name}:{error_messages[err_code]}\n" ) exit_status += 1 for err_code in ignore_errors.get(func_name, set()) - actual_failures: sys.stdout.write( - f'{prefix}{res["file"]}:{res["file_line"]}:' + f"{prefix}{res['file']}:{res['file_line']}:" f"{err_code}:{func_name}:" "EXPECTED TO FAIL, BUT NOT FAILING\n" ) @@ -384,8 +388,9 @@ def print_validate_all_results( return exit_status -def print_validate_one_results(func_name: str, - ignore_errors: dict[str, set[str]]) -> int: +def print_validate_one_results( + func_name: str, ignore_errors: dict[str, set[str]] +) -> int: def header(title, width=80, char="#") -> str: full_line = char * width side_len = (width - len(title) - 2) // 2 @@ -396,15 +401,18 @@ def header(title, width=80, char="#") -> str: result = pandas_validate(func_name) - result["errors"] = [(code, message) for code, message in result["errors"] - if code not in ignore_errors.get(None, set())] + result["errors"] = [ + (code, message) + for code, message in result["errors"] + if code not in ignore_errors.get(None, set()) + ] sys.stderr.write(header(f"Docstring ({func_name})")) sys.stderr.write(f"{result['docstring']}\n") sys.stderr.write(header("Validation")) if result["errors"]: - sys.stderr.write(f'{len(result["errors"])} Errors found for `{func_name}`:\n') + sys.stderr.write(f"{len(result['errors'])} Errors found for `{func_name}`:\n") for err_code, err_desc in result["errors"]: sys.stderr.write(f"\t{err_code}\t{err_desc}\n") else: @@ -431,14 +439,16 @@ def _format_ignore_errors(raw_ignore_errors): raise ValueError( f"Object `{obj_name}` is present in more than one " "--ignore_errors argument. Please use it once and specify " - "the errors separated by commas.") + "the errors separated by commas." + ) ignore_errors[obj_name] = set(error_codes.split(",")) unknown_errors = ignore_errors[obj_name] - ALL_ERRORS if unknown_errors: raise ValueError( f"Object `{obj_name}` is ignoring errors {unknown_errors} " - f"which are not known. Known errors are: {ALL_ERRORS}") + f"which are not known. Known errors are: {ALL_ERRORS}" + ) # global errors "PR02,ES01" else: @@ -448,27 +458,19 @@ def _format_ignore_errors(raw_ignore_errors): if unknown_errors: raise ValueError( f"Unknown errors {unknown_errors} specified using --ignore_errors " - "Known errors are: {ALL_ERRORS}") + "Known errors are: {ALL_ERRORS}" + ) return ignore_errors -def main( - func_name, - output_format, - prefix, - ignore_deprecated, - ignore_errors -): +def main(func_name, output_format, prefix, ignore_deprecated, ignore_errors): """ Main entry point. Call the validation for one or for all docstrings. """ if func_name is None: return print_validate_all_results( - output_format, - prefix, - ignore_deprecated, - ignore_errors + output_format, prefix, ignore_deprecated, ignore_errors ) else: return print_validate_one_results(func_name, ignore_errors) @@ -524,10 +526,11 @@ def main( args = argparser.parse_args(sys.argv[1:]) sys.exit( - main(args.function, - args.format, - args.prefix, - args.ignore_deprecated, - _format_ignore_errors(args.ignore_errors), - ) + main( + args.function, + args.format, + args.prefix, + args.ignore_deprecated, + _format_ignore_errors(args.ignore_errors), + ) ) diff --git a/scripts/validate_exception_location.py b/scripts/validate_exception_location.py index ecba1eb424ad5..8581a0c873f04 100644 --- a/scripts/validate_exception_location.py +++ b/scripts/validate_exception_location.py @@ -18,6 +18,7 @@ As a pre-commit hook: pre-commit run validate-errors-locations --all-files """ + from __future__ import annotations import argparse diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 1001b00450354..7d5fea58b60ea 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -12,6 +12,7 @@ pre-commit run validate-min-versions-in-sync --all-files """ + from __future__ import annotations import pathlib @@ -105,7 +106,7 @@ def get_operator_from(dependency: str) -> str | None: def get_yaml_map_from( - yaml_dic: list[str | dict[str, list[str]]] + yaml_dic: list[str | dict[str, list[str]]], ) -> dict[str, list[str] | None]: yaml_map: dict[str, list[str] | None] = {} for dependency in yaml_dic: diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 73a90f4fca0f6..243f70b4b8fcd 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -11,6 +11,7 @@ From the command-line: python scripts/validate_rst_title_capitalization.py """ + from __future__ import annotations import argparse @@ -271,7 +272,8 @@ def main(source_paths: list[str]) -> int: if title != correct_title_capitalization(title): print( f"""{filename}:{line_number}:{err_msg} "{title}" to "{ - correct_title_capitalization(title)}" """ + correct_title_capitalization(title) + }" """ ) number_of_errors += 1 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index d804e15f6d48f..4e241c7eba659 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -179,17 +179,11 @@ def strings_with_wrong_placed_whitespace( For example: - >>> rule = ( - ... "We want the space at the end of the line, " - ... "not at the beginning" - ... ) + >>> rule = "We want the space at the end of the line, not at the beginning" Instead of: - >>> rule = ( - ... "We want the space at the end of the line," - ... " not at the beginning" - ... ) + >>> rule = "We want the space at the end of the line, not at the beginning" Parameters ---------- @@ -229,17 +223,11 @@ def has_wrong_whitespace(first_line: str, second_line: str) -> bool: For example, this is bad: - >>> rule = ( - ... "We want the space at the end of the line," - ... " not at the beginning" - ... ) + >>> rule = "We want the space at the end of the line, not at the beginning" And what we want is: - >>> rule = ( - ... "We want the space at the end of the line, " - ... "not at the beginning" - ... ) + >>> rule = "We want the space at the end of the line, not at the beginning" And if the string is ending with a new line character (\n) we do not want any trailing whitespaces after it. @@ -247,17 +235,17 @@ def has_wrong_whitespace(first_line: str, second_line: str) -> bool: For example, this is bad: >>> rule = ( - ... "We want the space at the begging of " - ... "the line if the previous line is ending with a \n " - ... "not at the end, like always" + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n " + ... "not at the end, like always" ... ) And what we do want is: >>> rule = ( - ... "We want the space at the begging of " - ... "the line if the previous line is ending with a \n" - ... " not at the end, like always" + ... "We want the space at the begging of " + ... "the line if the previous line is ending with a \n" + ... " not at the end, like always" ... ) """ if first_line.endswith(r"\n"): @@ -319,10 +307,14 @@ def nodefault_used_not_only_for_typing(file_obj: IO[str]) -> Iterable[tuple[int, while nodes: in_annotation, node = nodes.pop() if not in_annotation and ( - (isinstance(node, ast.Name) # Case `NoDefault` - and node.id == "NoDefault") - or (isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` - and node.attr == "NoDefault") + ( + isinstance(node, ast.Name) # Case `NoDefault` + and node.id == "NoDefault" + ) + or ( + isinstance(node, ast.Attribute) # Cases e.g. `lib.NoDefault` + and node.attr == "NoDefault" + ) ): yield (node.lineno, "NoDefault is used not only for typing") From bcf1e0df9924fef0b2a13b64eea841ccb3ee8e71 Mon Sep 17 00:00:00 2001 From: mohammed benyamna Date: Fri, 2 May 2025 12:28:32 +0100 Subject: [PATCH 2/3] fix the unused variable in the test_setops.py --- pandas/tests/indexes/multi/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index ad9d849fc7c79..1bb69f32e310d 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -196,7 +196,7 @@ def test_difference(idx, sort): def test_multiindex_difference_pyarrow_timestamp(): - pa = pytest.importorskip("pyarrow") + pytest.importorskip("pyarrow") df = ( DataFrame( From 748596d7f1ff258b673a7c8fa17dae331bfd2ff5 Mon Sep 17 00:00:00 2001 From: mohammed benyamna Date: Fri, 2 May 2025 13:09:54 +0100 Subject: [PATCH 3/3] fix the unused variable in the test_setops.py --- pandas/tests/indexes/multi/test_setops.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 1bb69f32e310d..02dc2f16861c5 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -225,7 +225,25 @@ def test_multiindex_difference_pyarrow_timestamp(): # Verify the result assert len(new_idx) == 1 assert idx_val not in new_idx - assert new_idx.equals(MultiIndex.from_tuples([(2, pd.Timestamp("1900-01-01"))])) + + # Create expected index with the same PyArrow timestamp dtype + expected_df = ( + DataFrame( + [(2, "1900-01-01", "b")], + columns=["id", "date", "val"], + ) + .astype( + { + "id": "int64[pyarrow]", + "date": "timestamp[ns][pyarrow]", + "val": "string[pyarrow]", + } + ) + .set_index(["id", "date"]) + ) + expected = expected_df.index + + assert new_idx.equals(expected) def test_difference_sort_special():