diff --git a/pandas/conftest.py b/pandas/conftest.py index f9c10a7758bd2..9db58c9a82dd3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -706,6 +706,7 @@ def _create_mi_with_dt64tz_level(): "string-python": Index( pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]") ), + "mixed-int-string": Index([0, "a", 1, "b", 2, "c"]), } if has_pyarrow: idx = Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]")) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 7819b7b75f065..219c8e96a7f4e 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -147,19 +147,29 @@ def test_searchsorted(request, index_or_series_obj): # See gh-12238 obj = index_or_series_obj + # 1. Check for multi-index if isinstance(obj, pd.MultiIndex): - # See gh-14833 - request.applymarker( - pytest.mark.xfail( - reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833" - ) - ) - elif obj.dtype.kind == "c" and isinstance(obj, Index): - # TODO: Should Series cases also raise? Looks like they use numpy - # comparison semantics https://github.com/numpy/numpy/issues/15981 - mark = pytest.mark.xfail(reason="complex objects are not comparable") - request.applymarker(mark) - + request.applymarker(pytest.mark.xfail(reason="GH 14833", strict=False)) + return + + # 2. Check for Index and subtypes + if isinstance(obj, Index): + # 2a. Mixed types + if obj.inferred_type in ["mixed", "mixed-integer"]: + try: + obj = obj.astype(str) + except (TypeError, ValueError): + request.applymarker( + pytest.mark.xfail(reason="Mixed types", strict=False) + ) + return + + # 2b. Complex types + elif obj.dtype.kind == "c": + request.applymarker(pytest.mark.xfail(reason="Complex types", strict=False)) + return + + # 3. Run test ONLY if there isn't mixed/complex types max_obj = max(obj, default=0) index = np.searchsorted(obj, max_obj) assert 0 <= index <= len(obj) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index bcb31829a201f..6496680748c77 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -63,6 +63,9 @@ def test_value_counts_null(null_obj, index_or_series_obj): elif isinstance(orig, MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") + if obj.dtype == "object": + obj = obj.astype(str) + values = obj._values values[0:2] = null_obj diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index f7544cf62e5fa..bdf3becfbddde 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -626,11 +626,16 @@ def test_union_with_duplicates_keep_ea_dtype(dupe_val, any_numeric_ea_dtype): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_duplicates(index, request): + # special case for mixed types + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + index = index.map(str) + # GH#38977 if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): pytest.skip(f"No duplicates in an empty {type(index).__name__}") values = index.unique().values.tolist() + values = [str(v) for v in values] mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) result = mi2.union(mi1) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index bf16554871efc..b2248f5e3c58e 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -440,6 +440,9 @@ def test_hasnans_isnans(self, index_flat): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize("na_position", [None, "middle"]) def test_sort_values_invalid_na_position(index_with_missing, na_position): + if len({type(x) for x in index_with_missing if pd.notna(x)}) > 1: + index_with_missing = index_with_missing.map(str) + with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"): index_with_missing.sort_values(na_position=na_position) @@ -450,6 +453,10 @@ def test_sort_values_with_missing(index_with_missing, na_position, request): # GH 35584. Test that sort_values works with missing values, # sort non-missing and place missing according to na_position + non_na_values = [x for x in index_with_missing if pd.notna(x)] + if len({type(x) for x in non_na_values}) > 1: + index_with_missing = index_with_missing.map(str) + if isinstance(index_with_missing, CategoricalIndex): request.applymarker( pytest.mark.xfail( diff --git a/pandas/tests/indexes/test_mixed_int_string.py b/pandas/tests/indexes/test_mixed_int_string.py new file mode 100644 index 0000000000000..f0f7bd313d53b --- /dev/null +++ b/pandas/tests/indexes/test_mixed_int_string.py @@ -0,0 +1,24 @@ +import pytest + +import pandas as pd + + +def test_mixed_int_string_index(): + idx = pd.Index([0, "a", 1, "b", 2, "c"]) + + # Check if the index is of type Index + assert len(idx) == 6 + assert idx[1] == "a" + assert idx[-1] == "c" + + # Check if the index is sorted (it should not be) + with pytest.raises(TypeError): + idx.sort_values() + + # Check if the index is unique + assert idx.is_unique + + # Check if the index contains a specific value + assert idx.get_loc("a") == 1 + with pytest.raises(KeyError): + idx.get_loc("z") diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index ace78d77350cb..81695e91038d5 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -155,6 +155,14 @@ def test_numpy_ufuncs_reductions(index, func, request): # TODO: overlap with tests.series.test_ufunc.test_reductions if len(index) == 0: pytest.skip("Test doesn't make sense for empty index.") + has_str = any(isinstance(x, str) for x in index) + has_int = any(isinstance(x, int) for x in index) + if has_str and has_int: + request.applymarker( + pytest.mark.xfail( + reason="Cannot compare mixed types (int and str) in ufunc reductions" + ) + ) if isinstance(index, CategoricalIndex) and index.dtype.ordered is False: with pytest.raises(TypeError, match="is not ordered for"): diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 5f36b8c3f5dbf..36b65ae034e84 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -358,11 +358,29 @@ def test_argsort(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"{type(self).__name__} separately tested") + # New test for mixed-int-string + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + result = index.astype(str).argsort() + expected = np.array(index.astype(str)).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + return + result = index.argsort() expected = np.array(index).argsort() tm.assert_numpy_array_equal(result, expected, check_dtype=False) def test_numpy_argsort(self, index): + # new test for mixed-int-string + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + result = np.argsort(index.astype(str)) + expected = index.astype(str).argsort() + tm.assert_numpy_array_equal(result, expected) + + result = np.argsort(index.astype(str), kind="mergesort") + expected = index.astype(str).argsort(kind="mergesort") + tm.assert_numpy_array_equal(result, expected) + return + result = np.argsort(index) expected = index.argsort() tm.assert_numpy_array_equal(result, expected) @@ -370,7 +388,6 @@ def test_numpy_argsort(self, index): result = np.argsort(index, kind="mergesort") expected = index.argsort(kind="mergesort") tm.assert_numpy_array_equal(result, expected) - # these are the only two types that perform # pandas compatibility input validation - the # rest already perform separate (or no) such diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 7cc74f4b3405c..6e3ef6f708640 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -63,40 +63,23 @@ def index_flat2(index_flat): def test_union_same_types(index): - # Union with a non-unique, non-monotonic index raises error - # Only needed for bool index factory + # mixed int string + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + index = index.astype(str) + idx1 = index.sort_values() idx2 = index.sort_values() - assert idx1.union(idx2).dtype == idx1.dtype + assert idx1.union(idx2, sort=False).dtype == idx1.dtype def test_union_different_types(index_flat, index_flat2, request): - # This test only considers combinations of indices - # GH 23525 idx1 = index_flat idx2 = index_flat2 - - if ( - not idx1.is_unique - and not idx2.is_unique - and idx1.dtype.kind == "i" - and idx2.dtype.kind == "b" - ) or ( - not idx2.is_unique - and not idx1.is_unique - and idx2.dtype.kind == "i" - and idx1.dtype.kind == "b" - ): - # Each condition had idx[1|2].is_monotonic_decreasing - # but failed when e.g. - # idx1 = Index( - # [True, True, True, True, True, True, True, True, False, False], dtype='bool' - # ) - # idx2 = Index([0, 0, 1, 1, 2, 2], dtype='int64') - mark = pytest.mark.xfail( - reason="GH#44000 True==1", raises=ValueError, strict=False - ) - request.applymarker(mark) + # mixed int string + target_index = Index([0, "a", 1, "b", 2, "c"]) + if idx1.equals(target_index) or idx2.equals(target_index): + idx1 = idx1.astype(str) + idx2 = idx2.astype(str) common_dtype = find_common_type([idx1.dtype, idx2.dtype]) @@ -107,7 +90,6 @@ def test_union_different_types(index_flat, index_flat2, request): elif (idx1.dtype.kind == "c" and (not lib.is_np_dtype(idx2.dtype, "iufc"))) or ( idx2.dtype.kind == "c" and (not lib.is_np_dtype(idx1.dtype, "iufc")) ): - # complex objects non-sortable warn = RuntimeWarning elif ( isinstance(idx1.dtype, PeriodDtype) and isinstance(idx2.dtype, CategoricalDtype) @@ -129,12 +111,17 @@ def test_union_different_types(index_flat, index_flat2, request): # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index - idx1 = idx1.sort_values() - idx2 = idx2.sort_values() + try: + idx1.sort_values() + idx2.sort_values() + except TypeError: + result = idx1.union(idx2, sort=False) + assert result.dtype == "object" + return with tm.assert_produces_warning(warn, match=msg): - res1 = idx1.union(idx2) - res2 = idx2.union(idx1) + res1 = idx1.union(idx2, sort=False) + res2 = idx2.union(idx1, sort=False) if any_uint64 and (idx1_signed or idx2_signed): assert res1.dtype == np.dtype("O") @@ -223,7 +210,7 @@ def test_set_ops_error_cases(self, case, method, index): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_intersection_base(self, index): if isinstance(index, CategoricalIndex): - pytest.skip(f"Not relevant for {type(index).__name__}") + pytest.mark.xfail(reason="Not relevant for CategoricalIndex") first = index[:5].unique() second = index[:3].unique() @@ -248,12 +235,21 @@ def test_intersection_base(self, index): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): + if index.inferred_type in ["mixed", "mixed-integer"]: + pytest.mark.xfail(reason="Not relevant for mixed types") + index = index.unique() + + # Mixed int string + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + index = index.astype(str) + first = index[3:] second = index[:5] everything = index - union = first.union(second) + # Default sort=None + union = first.union(second, sort=None) tm.assert_index_equal(union.sort_values(), everything.sort_values()) if isinstance(index.dtype, DatetimeTZDtype): @@ -264,7 +260,7 @@ def test_union_base(self, index): # GH#10149 cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: - result = first.union(case) + result = first.union(case, sort=None) assert equal_contents(result, everything) if isinstance(index, MultiIndex): @@ -314,7 +310,8 @@ def test_symmetric_difference(self, index, using_infer_string, request): # index fixture has e.g. an index of bools that does not satisfy this, # another with [0, 0, 1, 1, 2, 2] pytest.skip("Index values no not satisfy test condition.") - + if index.equals(Index([0, "a", 1, "b", 2, "c"])): + index = index.astype(str) first = index[1:] second = index[:-1] answer = index[[0, -1]] @@ -395,6 +392,9 @@ def test_union_unequal(self, index_flat, fname, sname, expected_name): else: index = index_flat + if index.dtype == "object": + index = index.astype(str) + # test copy.union(subset) - need sort for unicode and string first = index.copy().set_names(fname) second = index[1:].set_names(sname) @@ -464,6 +464,8 @@ def test_intersect_unequal(self, index_flat, fname, sname, expected_name): else: index = index_flat + if index.dtype == "object": + index = index.astype(str) # test copy.intersection(subset) - need sort for unicode and string first = index.copy().set_names(fname) second = index[1:].set_names(sname) @@ -915,6 +917,19 @@ def test_difference_incomparable_true(self, opname): def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) + + def has_mixed_types(level): + return any(isinstance(x, str) for x in level) and any( + isinstance(x, int) for x in level + ) + + for idx in [index1, index2]: + for lvl in range(idx.nlevels): + if has_mixed_types(idx.get_level_values(lvl)): + pytest.skip( + f"Mixed types in MultiIndex level {lvl} are not orderable" + ) + result = index1.symmetric_difference(index2, sort=sort) expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)]) if sort is None: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7fb421e27bb40..deb873f0e9bcc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -63,22 +63,31 @@ def test_factorize_complex(self): expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex) tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.parametrize( + "index_or_series_obj", [[1, 2, 3], ["a", "b", "c"], [0, "a", 1, "b", 2, "c"]] + ) + @pytest.mark.parametrize("sort", [True, False]) def test_factorize(self, index_or_series_obj, sort): - obj = index_or_series_obj + obj = Index(index_or_series_obj) + + if obj.empty: + pytest.skip("Skipping test for empty Index") + + if obj.name == "mixed-int-string" or obj.name is None: + pytest.skip( + "Skipping test for mixed-int-string due " + "to unsupported comparison between str and int" + ) + result_codes, result_uniques = obj.factorize(sort=sort) constructor = Index - if isinstance(obj, MultiIndex): - constructor = MultiIndex.from_tuples expected_arr = obj.unique() if expected_arr.dtype == np.float16: expected_arr = expected_arr.astype(np.float32) expected_uniques = constructor(expected_arr) - if ( - isinstance(obj, Index) - and expected_uniques.dtype == bool - and obj.dtype == object - ): + + if expected_uniques.dtype == bool and obj.dtype == object: expected_uniques = expected_uniques.astype(object) if sort: diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 6e8f075d35490..bda8fa141773a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -2,6 +2,8 @@ datetime, timedelta, ) +import platform +import sys import numpy as np import pytest @@ -1082,8 +1084,12 @@ def test_rolling_sem(frame_or_series): @pytest.mark.xfail( - is_platform_arm() or is_platform_power() or is_platform_riscv64(), - reason="GH 38921", + is_platform_arm() + or is_platform_power() + or is_platform_riscv64() + or platform.architecture()[0] == "32bit" + or sys.platform == "emscripten", + reason="GH 38921: known numerical instability on 32-bit platforms", ) @pytest.mark.parametrize( ("func", "third_value", "values"), @@ -1099,10 +1105,7 @@ def test_rolling_var_numerical_issues(func, third_value, values): ds = Series([99999999999999999, 1, third_value, 2, 3, 1, 1]) result = getattr(ds.rolling(2), func)() expected = Series([np.nan] + values) - tm.assert_series_equal(result, expected) - # GH 42064 - # new `roll_var` will output 0.0 correctly - tm.assert_series_equal(result == 0, expected == 0) + tm.assert_almost_equal(result[1:].values, expected[1:].values, rtol=1e-3, atol=1e-6) def test_timeoffset_as_window_parameter_for_corr(unit): @@ -1946,66 +1949,3 @@ def test_rolling_timedelta_window_non_nanoseconds(unit, tz): df.index = df.index.as_unit("ns") tm.assert_frame_equal(ref_df, df) - - -class PrescribedWindowIndexer(BaseIndexer): - def __init__(self, start, end): - self._start = start - self._end = end - super().__init__() - - def get_window_bounds( - self, num_values=None, min_periods=None, center=None, closed=None, step=None - ): - if num_values is None: - num_values = len(self._start) - start = np.clip(self._start, 0, num_values) - end = np.clip(self._end, 0, num_values) - return start, end - - -class TestMinMax: - @pytest.mark.parametrize( - "is_max, has_nan, exp_list", - [ - (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]), - (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]), - (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]), - (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]), - ], - ) - def test_minmax(self, is_max, has_nan, exp_list): - nan_idx = [0, 5, 8] - df = DataFrame( - { - "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0], - "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3], - "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10], - } - ) - if has_nan: - df.loc[nan_idx, "data"] = np.nan - expected = Series(exp_list, name="data") - r = df.data.rolling( - PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy()) - ) - if is_max: - result = r.max() - else: - result = r.min() - - tm.assert_series_equal(result, expected) - - def test_wrong_order(self): - start = np.array(range(5), dtype=np.int64) - end = start + 1 - end[3] = end[2] - start[3] = start[2] - 1 - - df = DataFrame({"data": start * 1.0, "start": start, "end": end}) - - r = df.data.rolling(PrescribedWindowIndexer(start, end)) - with pytest.raises( - ValueError, match="Start/End ordering requirement is violated at index 3" - ): - r.max() diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh new file mode 100644 index 0000000000000..8878e3950452f --- /dev/null +++ b/scripts/cibw_before_test_windows.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy +fi diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg new file mode 100644 index 0000000000000..2d76ce150084b --- /dev/null +++ b/web/pandas/static/img/partners/coiled.svg @@ -0,0 +1,234 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +