Skip to content

TST: Testing for mixed int/str Index #61349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 46 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
29039f9
Added requirements.txt with project dependencies
pelagiavlas Mar 26, 2025
65de448
ValueError in pytest parametrization due to direct Index object evalu…
pelagiavlas Apr 7, 2025
0816a26
BUG: Fix TypeError in set operations with mixed int/string indexes
pelagiavlas Apr 7, 2025
b87036f
BUG: Handle mixed int/str types in Index.union
pelagiavlas Apr 7, 2025
946f99b
BUG: Fix value_counts() with mixed int/str indexes containing nulls
pelagiavlas Apr 7, 2025
a671eb7
Merge branch 'main' of https://github.com/pandas-dev/pandas
xaris96 Apr 9, 2025
416a6ae
Merge branch 'main' of https://github.com/pandas-dev/pandas
xaris96 Apr 9, 2025
2e63667
BUG: Ignore mixed-type comparison warning in tests
pelagiavlas Apr 10, 2025
5550b1d
BUG: Apply xfail to handle unsupported int/str comparison in test_sor…
pelagiavlas Apr 10, 2025
33e2a34
BUG: Apply xfail to handle unsupported int/str comparison in test_sor…
pelagiavlas Apr 10, 2025
d7b534e
BUG: Mark test_numpy_ufuncs_reductions as xfail for mixed int/str index
pelagiavlas Apr 10, 2025
642734e
BUG: Avoid mixed-type Index in argsort test to prevent sorting errors
pelagiavlas Apr 14, 2025
dea15de
BUG: Skip argsort tests for mixed-type Index to avoid TypeError
pelagiavlas Apr 14, 2025
03c3b0a
TST: Add skip for tests using mixed-type Index
pelagiavlas Apr 14, 2025
5d1c154
one new test just for the mixed string in indices_dict (pandas\confet…
xaris96 Apr 21, 2025
c10c263
log files
xaris96 Apr 23, 2025
edb84e4
fixed test_union_duplicates[mixed-int-string] test fail in tests\inde…
xaris96 Apr 23, 2025
af140a8
2 test passed for mixed int string
xaris96 Apr 23, 2025
8992100
test_union_same_type mixed int string
xaris96 Apr 24, 2025
1fe92f9
test_union_different_types mixed int string fixed
xaris96 Apr 24, 2025
599df6d
test_union_base mixed int string test fail fixed
xaris96 Apr 24, 2025
3256953
total 5 tests fixed and 2 made xfailed
xaris96 Apr 24, 2025
bf05b29
Merge branch 'issue-TM' into vol2
xaris96 Apr 24, 2025
c856799
all tests passed!
xaris96 Apr 24, 2025
ed90c56
merged
xaris96 Apr 24, 2025
e3f1eb2
changes
xaris96 Apr 24, 2025
a784a90
log files deleted
xaris96 May 5, 2025
eb2f210
Fix trailing whitespace in test_mixed_int_string.py
xaris96 May 7, 2025
710e4d5
changes for pre-commit.ci
xaris96 May 7, 2025
079aeb1
pre-commit run --all-files changes
xaris96 May 7, 2025
545f04c
lines too long
xaris96 May 7, 2025
a16f5b3
mark x fail and some tests fixed
xaris96 May 8, 2025
413dad1
new
xaris96 May 8, 2025
a6b958b
pd fixed
xaris96 May 8, 2025
0c0ef09
test passed
xaris96 May 8, 2025
d3a2378
mark.xfail instead of skip in test_setops
xaris96 May 8, 2025
355a058
test_misc
xaris96 May 10, 2025
a2d5fbf
done
xaris96 May 10, 2025
ec189e4
better approach for mixed int, 1 more test passed
xaris96 May 10, 2025
771c098
mark x fail
xaris96 May 10, 2025
25ba609
Merge branch 'main' into issue-TM
xaris96 May 10, 2025
acd31b1
Trigger CI rerun
xaris96 May 11, 2025
b522022
test rolling change
xaris96 May 11, 2025
64bf3fe
test rolling
xaris96 May 11, 2025
4c4e673
new change
xaris96 May 11, 2025
96c26a3
pre commit checks done
xaris96 May 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,7 @@ def _create_mi_with_dt64tz_level():
"string-python": Index(
pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]")
),
"mixed-int-string": Index([0, "a", 1, "b", 2, "c"]),
}
if has_pyarrow:
idx = Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]"))
Expand Down
34 changes: 22 additions & 12 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,29 @@ def test_searchsorted(request, index_or_series_obj):
# See gh-12238
obj = index_or_series_obj

# 1. Check for multi-index
if isinstance(obj, pd.MultiIndex):
# See gh-14833
request.applymarker(
pytest.mark.xfail(
reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833"
)
)
elif obj.dtype.kind == "c" and isinstance(obj, Index):
# TODO: Should Series cases also raise? Looks like they use numpy
# comparison semantics https://github.com/numpy/numpy/issues/15981
mark = pytest.mark.xfail(reason="complex objects are not comparable")
request.applymarker(mark)

request.applymarker(pytest.mark.xfail(reason="GH 14833", strict=False))
return

# 2. Check for Index and subtypes
if isinstance(obj, Index):
# 2a. Mixed types
if obj.inferred_type in ["mixed", "mixed-integer"]:
try:
obj = obj.astype(str)
except (TypeError, ValueError):
request.applymarker(
pytest.mark.xfail(reason="Mixed types", strict=False)
)
return

# 2b. Complex types
elif obj.dtype.kind == "c":
request.applymarker(pytest.mark.xfail(reason="Complex types", strict=False))
return

# 3. Run test ONLY if there isn't mixed/complex types
max_obj = max(obj, default=0)
index = np.searchsorted(obj, max_obj)
assert 0 <= index <= len(obj)
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def test_value_counts_null(null_obj, index_or_series_obj):
elif isinstance(orig, MultiIndex):
pytest.skip(f"MultiIndex can't hold '{null_obj}'")

if obj.dtype == "object":
obj = obj.astype(str)

values = obj._values
values[0:2] = null_obj

Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,11 +626,16 @@ def test_union_with_duplicates_keep_ea_dtype(dupe_val, any_numeric_ea_dtype):

@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_union_duplicates(index, request):
# special case for mixed types
if index.equals(Index([0, "a", 1, "b", 2, "c"])):
index = index.map(str)

# GH#38977
if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)):
pytest.skip(f"No duplicates in an empty {type(index).__name__}")

values = index.unique().values.tolist()
values = [str(v) for v in values]
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
result = mi2.union(mi1)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,9 @@ def test_hasnans_isnans(self, index_flat):
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.parametrize("na_position", [None, "middle"])
def test_sort_values_invalid_na_position(index_with_missing, na_position):
    # An unsupported ``na_position`` must be rejected with a ValueError.
    # Indexes whose non-missing values have more than one Python type are
    # converted element-wise to strings first.
    # NOTE(review): presumably so that sorting incomparable values cannot
    # fail before the na_position error is raised -- confirm.
    present_types = {type(value) for value in index_with_missing if pd.notna(value)}
    if len(present_types) > 1:
        index_with_missing = index_with_missing.map(str)

    expected_msg = f"invalid na_position: {na_position}"
    with pytest.raises(ValueError, match=expected_msg):
        index_with_missing.sort_values(na_position=na_position)

Expand All @@ -450,6 +453,10 @@ def test_sort_values_with_missing(index_with_missing, na_position, request):
# GH 35584. Test that sort_values works with missing values,
# sort non-missing and place missing according to na_position

non_na_values = [x for x in index_with_missing if pd.notna(x)]
if len({type(x) for x in non_na_values}) > 1:
index_with_missing = index_with_missing.map(str)

if isinstance(index_with_missing, CategoricalIndex):
request.applymarker(
pytest.mark.xfail(
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/indexes/test_mixed_int_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pytest

import pandas as pd


def test_mixed_int_string_index():
    # Basic behaviour of an Index that holds both int and str labels.
    idx = pd.Index([0, "a", 1, "b", 2, "c"])

    # Construction yields an Index that keeps every element in place
    assert isinstance(idx, pd.Index)
    assert len(idx) == 6
    assert idx[1] == "a"
    assert idx[-1] == "c"

    # Sorting must raise: int and str labels cannot be ordered against each
    # other. Matching on the message keeps an unrelated TypeError from
    # satisfying the check.
    msg = "not supported between instances of"
    with pytest.raises(TypeError, match=msg):
        idx.sort_values()

    # All six labels are distinct
    assert idx.is_unique

    # Label lookup returns the position for both str and int labels and
    # raises KeyError (naming the label) for a label that is absent
    assert idx.get_loc("a") == 1
    assert idx.get_loc(1) == 2
    with pytest.raises(KeyError, match="z"):
        idx.get_loc("z")
8 changes: 8 additions & 0 deletions pandas/tests/indexes/test_numpy_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,14 @@ def test_numpy_ufuncs_reductions(index, func, request):
# TODO: overlap with tests.series.test_ufunc.test_reductions
if len(index) == 0:
pytest.skip("Test doesn't make sense for empty index.")
has_str = any(isinstance(x, str) for x in index)
has_int = any(isinstance(x, int) for x in index)
if has_str and has_int:
request.applymarker(
pytest.mark.xfail(
reason="Cannot compare mixed types (int and str) in ufunc reductions"
)
)

if isinstance(index, CategoricalIndex) and index.dtype.ordered is False:
with pytest.raises(TypeError, match="is not ordered for"):
Expand Down
19 changes: 18 additions & 1 deletion pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,19 +358,36 @@ def test_argsort(self, index):
if isinstance(index, CategoricalIndex):
pytest.skip(f"{type(self).__name__} separately tested")

# New test for mixed-int-string
if index.equals(Index([0, "a", 1, "b", 2, "c"])):
result = index.astype(str).argsort()
expected = np.array(index.astype(str)).argsort()
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
return

result = index.argsort()
expected = np.array(index).argsort()
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

def test_numpy_argsort(self, index):
# new test for mixed-int-string
if index.equals(Index([0, "a", 1, "b", 2, "c"])):
result = np.argsort(index.astype(str))
expected = index.astype(str).argsort()
tm.assert_numpy_array_equal(result, expected)

result = np.argsort(index.astype(str), kind="mergesort")
expected = index.astype(str).argsort(kind="mergesort")
tm.assert_numpy_array_equal(result, expected)
return

result = np.argsort(index)
expected = index.argsort()
tm.assert_numpy_array_equal(result, expected)

result = np.argsort(index, kind="mergesort")
expected = index.argsort(kind="mergesort")
tm.assert_numpy_array_equal(result, expected)

# these are the only two types that perform
# pandas compatibility input validation - the
# rest already perform separate (or no) such
Expand Down
87 changes: 51 additions & 36 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,40 +63,23 @@ def index_flat2(index_flat):


def test_union_same_types(index):
    # The union of an index with itself must keep the index's dtype, both
    # with the default ``sort`` and with ``sort=False``.
    mixed_int_str = Index([0, "a", 1, "b", 2, "c"])
    if index.equals(mixed_int_str):
        # the mixed int/str case is converted to strings before sorting
        index = index.astype(str)

    # Union with a non-unique, non-monotonic index raises error
    # Only needed for bool index factory
    left = index.sort_values()
    right = index.sort_values()
    expected_dtype = left.dtype

    assert left.union(right).dtype == expected_dtype
    assert left.union(right, sort=False).dtype == expected_dtype


def test_union_different_types(index_flat, index_flat2, request):
# This test only considers combinations of indices
# GH 23525
idx1 = index_flat
idx2 = index_flat2

if (
not idx1.is_unique
and not idx2.is_unique
and idx1.dtype.kind == "i"
and idx2.dtype.kind == "b"
) or (
not idx2.is_unique
and not idx1.is_unique
and idx2.dtype.kind == "i"
and idx1.dtype.kind == "b"
):
# Each condition had idx[1|2].is_monotonic_decreasing
# but failed when e.g.
# idx1 = Index(
# [True, True, True, True, True, True, True, True, False, False], dtype='bool'
# )
# idx2 = Index([0, 0, 1, 1, 2, 2], dtype='int64')
mark = pytest.mark.xfail(
reason="GH#44000 True==1", raises=ValueError, strict=False
)
request.applymarker(mark)
# mixed int string
target_index = Index([0, "a", 1, "b", 2, "c"])
if idx1.equals(target_index) or idx2.equals(target_index):
idx1 = idx1.astype(str)
idx2 = idx2.astype(str)

common_dtype = find_common_type([idx1.dtype, idx2.dtype])

Expand All @@ -107,7 +90,6 @@ def test_union_different_types(index_flat, index_flat2, request):
elif (idx1.dtype.kind == "c" and (not lib.is_np_dtype(idx2.dtype, "iufc"))) or (
idx2.dtype.kind == "c" and (not lib.is_np_dtype(idx1.dtype, "iufc"))
):
# complex objects non-sortable
warn = RuntimeWarning
elif (
isinstance(idx1.dtype, PeriodDtype) and isinstance(idx2.dtype, CategoricalDtype)
Expand All @@ -129,12 +111,17 @@ def test_union_different_types(index_flat, index_flat2, request):

# Union with a non-unique, non-monotonic index raises error
# This applies to the boolean index
idx1 = idx1.sort_values()
idx2 = idx2.sort_values()
try:
idx1.sort_values()
idx2.sort_values()
except TypeError:
result = idx1.union(idx2, sort=False)
assert result.dtype == "object"
return

with tm.assert_produces_warning(warn, match=msg):
res1 = idx1.union(idx2)
res2 = idx2.union(idx1)
res1 = idx1.union(idx2, sort=False)
res2 = idx2.union(idx1, sort=False)

if any_uint64 and (idx1_signed or idx2_signed):
assert res1.dtype == np.dtype("O")
Expand Down Expand Up @@ -223,7 +210,7 @@ def test_set_ops_error_cases(self, case, method, index):
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_intersection_base(self, index):
if isinstance(index, CategoricalIndex):
pytest.skip(f"Not relevant for {type(index).__name__}")
pytest.mark.xfail(reason="Not relevant for CategoricalIndex")

first = index[:5].unique()
second = index[:3].unique()
Expand All @@ -248,12 +235,21 @@ def test_intersection_base(self, index):

@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_union_base(self, index):
if index.inferred_type in ["mixed", "mixed-integer"]:
pytest.mark.xfail(reason="Not relevant for mixed types")

index = index.unique()

# Mixed int string
if index.equals(Index([0, "a", 1, "b", 2, "c"])):
index = index.astype(str)

first = index[3:]
second = index[:5]
everything = index

union = first.union(second)
# Default sort=None
union = first.union(second, sort=None)
tm.assert_index_equal(union.sort_values(), everything.sort_values())

if isinstance(index.dtype, DatetimeTZDtype):
Expand All @@ -264,7 +260,7 @@ def test_union_base(self, index):
# GH#10149
cases = [second.to_numpy(), second.to_series(), second.to_list()]
for case in cases:
result = first.union(case)
result = first.union(case, sort=None)
assert equal_contents(result, everything)

if isinstance(index, MultiIndex):
Expand Down Expand Up @@ -314,7 +310,8 @@ def test_symmetric_difference(self, index, using_infer_string, request):
# index fixture has e.g. an index of bools that does not satisfy this,
# another with [0, 0, 1, 1, 2, 2]
pytest.skip("Index values no not satisfy test condition.")

if index.equals(Index([0, "a", 1, "b", 2, "c"])):
index = index.astype(str)
first = index[1:]
second = index[:-1]
answer = index[[0, -1]]
Expand Down Expand Up @@ -395,6 +392,9 @@ def test_union_unequal(self, index_flat, fname, sname, expected_name):
else:
index = index_flat

if index.dtype == "object":
index = index.astype(str)

# test copy.union(subset) - need sort for unicode and string
first = index.copy().set_names(fname)
second = index[1:].set_names(sname)
Expand Down Expand Up @@ -464,6 +464,8 @@ def test_intersect_unequal(self, index_flat, fname, sname, expected_name):
else:
index = index_flat

if index.dtype == "object":
index = index.astype(str)
# test copy.intersection(subset) - need sort for unicode and string
first = index.copy().set_names(fname)
second = index[1:].set_names(sname)
Expand Down Expand Up @@ -915,6 +917,19 @@ def test_difference_incomparable_true(self, opname):
def test_symmetric_difference_mi(self, sort):
index1 = MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3]))
index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)])

def has_mixed_types(level):
return any(isinstance(x, str) for x in level) and any(
isinstance(x, int) for x in level
)

for idx in [index1, index2]:
for lvl in range(idx.nlevels):
if has_mixed_types(idx.get_level_values(lvl)):
pytest.skip(
f"Mixed types in MultiIndex level {lvl} are not orderable"
)

result = index1.symmetric_difference(index2, sort=sort)
expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)])
if sort is None:
Expand Down
25 changes: 17 additions & 8 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,31 @@ def test_factorize_complex(self):
expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex)
tm.assert_numpy_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize(
"index_or_series_obj", [[1, 2, 3], ["a", "b", "c"], [0, "a", 1, "b", 2, "c"]]
)
@pytest.mark.parametrize("sort", [True, False])
def test_factorize(self, index_or_series_obj, sort):
obj = index_or_series_obj
obj = Index(index_or_series_obj)

if obj.empty:
pytest.skip("Skipping test for empty Index")

if obj.name == "mixed-int-string" or obj.name is None:
pytest.skip(
"Skipping test for mixed-int-string due "
"to unsupported comparison between str and int"
)

result_codes, result_uniques = obj.factorize(sort=sort)

constructor = Index
if isinstance(obj, MultiIndex):
constructor = MultiIndex.from_tuples
expected_arr = obj.unique()
if expected_arr.dtype == np.float16:
expected_arr = expected_arr.astype(np.float32)
expected_uniques = constructor(expected_arr)
if (
isinstance(obj, Index)
and expected_uniques.dtype == bool
and obj.dtype == object
):

if expected_uniques.dtype == bool and obj.dtype == object:
expected_uniques = expected_uniques.astype(object)

if sort:
Expand Down
Loading
Loading