From f8b54d7b3d2d79304688ca31ed30a29245ad9fdd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 27 Jun 2024 04:26:40 +0000 Subject: [PATCH 1/3] Fix errors in String Arrays --- python/cudf/cudf/pandas/_wrappers/pandas.py | 20 +++++++++++++++++++ .../cudf/pandas/scripts/run-pandas-tests.sh | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 0ba432d6d0e..9c26006bb25 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -522,6 +522,26 @@ def Index__new__(cls, *args, **kwargs): }, ) +ArrowStringArrayNumpySemantics = make_final_proxy_type( + "ArrowStringArrayNumpySemantics", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), + # additional_attributes={ + # "_data": _FastSlowAttribute("_data", private=True), + # "_mask": _FastSlowAttribute("_mask", private=True), + # }, +) + +ArrowStringArray = make_final_proxy_type( + "ArrowStringArray", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArray, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), +) + StringDtype = make_final_proxy_type( "StringDtype", _Unusable, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index cd9f90d50fe..60d4b1f05c1 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -25,6 +25,7 @@ PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)") # tests/io/test_clipboard.py::TestClipboard crashes pytest workers (possibly due to fixture patching clipboard functionality) PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py \ --ignore=tests/io/test_clipboard.py" +# --ignore=tests/arrays/test_datetimelike.py " mkdir -p pandas-testing cd pandas-testing @@ -133,7 +134,8 @@ and not test_s3_roundtrip" TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ and not test_large_string_pyarrow \ and not test_interchange_from_corrected_buffer_dtypes \ -and not test_eof_states" +and not test_eof_states \ +and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ From e6e3960e6217a058b3993f6b17b66c2b97d7755b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 27 Jun 2024 13:36:13 +0000 Subject: [PATCH 2/3] add tests --- python/cudf/cudf/pandas/_wrappers/pandas.py | 4 ---- .../cudf_pandas_tests/test_cudf_pandas.py | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 9c26006bb25..a64bf7772fe 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -528,10 +528,6 @@ def Index__new__(cls, *args, **kwargs): pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, fast_to_slow=_Unusable(), slow_to_fast=_Unusable(), - # additional_attributes={ - # "_data": _FastSlowAttribute("_data", private=True), - # "_mask": _FastSlowAttribute("_mask", private=True), - # }, ) ArrowStringArray = make_final_proxy_type( diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index eed5037cbea..0d46e2e9311 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1533,3 +1533,26 @@ def test_is_proxy_object(): assert is_proxy_object(np_arr_proxy) assert is_proxy_object(s1) assert not is_proxy_object(s2) + + +def test_arrow_string_arrays(): + cu_s = xpd.Series(["a", "b", "c"]) + pd_s = pd.Series(["a", "b", "c"]) + + cu_arr = xpd.arrays.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow") + ) + pd_arr = pd.arrays.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow") + ) + + tm.assert_equal(cu_arr, pd_arr) + + cu_arr = xpd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow_numpy") + ) + pd_arr = pd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow_numpy") + ) + + tm.assert_equal(cu_arr, pd_arr) From a0e9da40eff655c34f8d77d274bfe58f31da0aaa Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 27 Jun 2024 13:37:22 +0000 Subject: [PATCH 3/3] cleanup --- python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 60d4b1f05c1..a66f63c09b3 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -25,7 +25,6 @@ PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)") # tests/io/test_clipboard.py::TestClipboard crashes pytest workers (possibly due to fixture patching clipboard functionality) PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py \ --ignore=tests/io/test_clipboard.py" -# --ignore=tests/arrays/test_datetimelike.py " mkdir -p pandas-testing cd pandas-testing