Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST (string dtype): clean-up xpassing tests with future string dtype #59323

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions pandas/tests/arithmetic/test_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -303,7 +301,6 @@ def test_iadd_string(self):
index += "_x"
assert "a_x" in index

@pytest.mark.xfail(using_string_dtype(), reason="add doesn't work")
def test_add(self):
index = pd.Index([str(i) for i in range(10)])
expected = pd.Index(index.values * 2)
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/base/test_unique.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm
from pandas.tests.base.common import allow_na_ops
Expand Down Expand Up @@ -100,12 +98,11 @@ def test_nunique_null(null_obj, index_or_series_obj):


@pytest.mark.single_cpu
@pytest.mark.xfail(using_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
# regression test for #34550
uval = "\ud83d" # smiley emoji

obj = index_or_series([uval] * 2)
obj = index_or_series([uval] * 2, dtype=object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the solution here to add dtype=object? Shouldn't this just work naturally with the inferred string type?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't work with a string dtype because this test is about "bad unicode", and an actual string dtype cannot represent invalid unicode (at least when using pyarrow under the hood; I assume that our object-dtype based one will be able to hold it).

To keep the spirit of the test (ensure our unique implementation can work with bad unicode in object dtype), I made it explicitly use object dtype.

See also the "Invalid unicode input" section in #59328 (the issue I opened yesterday to start recording breaking changes / things that are no longer supported with the string dtype).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense - also makes me think how we can leverage a BinaryDtype in the future, though that is a different topic for a different day

result = obj.unique()

if isinstance(obj, pd.Index):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(using_string_dtype(), reason="columns inferring logic broken")
@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down Expand Up @@ -108,6 +108,7 @@ def test_constructor_list_of_series(self):
expected = DataFrame.from_dict(sdict, orient="index")
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
def test_constructor_orient(self, float_string_frame):
data_dict = float_string_frame.T._series
recons = DataFrame.from_dict(data_dict, orient="index")
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,7 @@ def test_from_records_with_datetimes(self):
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(
using_string_dtype(), reason="dtype checking logic doesn't work"
)
@pytest.mark.xfail(using_string_dtype(), reason="dtype checking logic doesn't work")
def test_from_records_sequencelike(self):
df = DataFrame(
{
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def test_fillna_datetime(self, datetime_frame):
with pytest.raises(TypeError, match=msg):
datetime_frame.fillna()

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
Expand Down Expand Up @@ -537,6 +538,7 @@ def test_fillna_col_reordering(self):
filled = df.ffill()
assert df.columns.tolist() == filled.columns.tolist()

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/frame/methods/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas import (
CategoricalIndex,
DataFrame,
Index,
MultiIndex,
Series,
date_range,
Expand Down Expand Up @@ -360,7 +361,7 @@ def test_info_memory_usage():
df = DataFrame(data)
df.columns = dtypes

df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment on all of these - might be overlooking something simple but unsure why dtype=object is the solution

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because what we are testing here is that if you have object dtype, the full memory usage is not known (and you get this "+"):

In [1]: df = DataFrame({"a": ["a", "b"]})

In [2]: df.info()
<class 'pandas.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes

In [3]: pd.options.future.infer_string = True

In [4]: df = DataFrame({"a": ["a", "b"]})

In [5]: df.info()
<class 'pandas.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       2 non-null      string
dtypes: string(1)
memory usage: 150.0 bytes

So 148.0+ bytes vs 150.0 bytes.

Of course I could also update the expected result to be accurate instead of an estimate, but 1) that would complicate the test (since we still have to account for both current and future behaviour), and 2) we still need to test the case of object dtype explicitly anyway.

We should probably add a test specifically for string dtype, though, where we can assert that if you have a proper string dtype the memory is now always the full number and not a lower estimate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool - yea would be nice to add a test for exact memory representation with the StringDtype + pyarrow, though can be done separately

df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])
Expand Down Expand Up @@ -398,25 +399,25 @@ def test_info_memory_usage():

@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
> df_with_object_index.memory_usage(index=True).sum()
)

df_object = DataFrame({"a": ["a"]})
df_object = DataFrame({"a": Series(["a"], dtype=object)})
assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()


@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
df_with_object_index = DataFrame({"a": [1]}, index=Index(["foo"], dtype=object))
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
== df_with_object_index.memory_usage(index=True).sum()
)

df_object = DataFrame({"a": ["a"]})
df_object = DataFrame({"a": Series(["a"], dtype=object)})
assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()


Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/methods/test_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_interpolate_inplace(self, frame_or_series, request):
assert np.shares_memory(orig, obj.values)
assert orig.squeeze()[1] == 1.5

# TODO(infer_string) raise proper TypeError in case of string dtype
@pytest.mark.xfail(
using_string_dtype(), reason="interpolate doesn't work for string"
)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -251,7 +249,6 @@ def test_timestamp_compare(self, left, right):
with pytest.raises(TypeError, match=msg):
right_f(pd.Timestamp("nat"), df)

@pytest.mark.xfail(using_string_dtype(), reason="can't compare string and int")
def test_mixed_comparison(self):
# GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
# not raise TypeError
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/indexes/interval/test_formats.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
DatetimeIndex,
Expand Down Expand Up @@ -42,12 +40,11 @@ def test_repr_missing(self, constructor, expected, using_infer_string, request):
result = repr(obj)
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="repr different")
def test_repr_floats(self):
# GH 32553

markers = Series(
["foo", "bar"],
[1, 2],
index=IntervalIndex(
[
Interval(left, right)
Expand All @@ -59,7 +56,7 @@ def test_repr_floats(self):
),
)
result = str(markers)
expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object"
expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64"
assert result == expected

@pytest.mark.parametrize(
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import (
IS64,
is_platform_windows,
Expand Down Expand Up @@ -825,8 +823,6 @@ def replacer(self, how, from_key, to_key):
raise ValueError
return replacer

# Expected needs adjustment for the infer string option, seems to work as expecetd
@pytest.mark.skipif(using_string_dtype(), reason="TODO: test is to complex")
def test_replace_series(self, how, to_key, from_key, replacer):
index = pd.Index([3, 4], name="xxx")
obj = pd.Series(self.rep[from_key], index=index, name="yyy")
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import IndexingError

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -426,7 +424,6 @@ def test_set_index_nan(self):
)
tm.assert_frame_equal(result, df)

@pytest.mark.xfail(using_string_dtype(), reason="can't multiply arrow strings")
def test_multi_assign(self):
# GH 3626, an assignment of a sub-df to a df
# set float64 to avoid upcast when setting nan
Expand Down Expand Up @@ -652,7 +649,6 @@ def test_loc_setitem_fullindex_views(self):
df.loc[df.index] = df.loc[df.index]
tm.assert_frame_equal(df, df2)

@pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
def test_rhs_alignment(self):
# GH8258, tests that both rows & columns are aligned to what is
# assigned to. covers both uniform data-type & multi-type cases
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/series/methods/test_reindex.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
NA,
Categorical,
Expand All @@ -22,7 +20,6 @@
import pandas._testing as tm


@pytest.mark.xfail(using_string_dtype(), reason="share memory doesn't work for arrow")
def test_reindex(datetime_series, string_series):
identity = string_series.reindex(string_series.index)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def test_tidy_repr_name_0(self, arg):
assert "Name: 0" in rep_str

@pytest.mark.xfail(
using_string_dtype(), reason="TODO: investigate why this is failing"
using_string_dtype(), reason="TODO(infer_string): investigate failure"
)
def test_newline(self):
ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"])
Expand Down
Loading