From b39022ae4b96a4a83300551c0b61008dd75fa9eb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:43:25 -0700 Subject: [PATCH 1/2] Fix DataFrame from Series with different CategoricalIndexes --- python/cudf/cudf/core/indexed_frame.py | 7 +++++++ python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 62e091b29b5..aacf1fa8dae 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5438,6 +5438,13 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): # for matching column dtype. if lhs_dtype == rhs_dtype: return True + elif ( + is_categorical_dtype(lhs_dtype) + and is_categorical_dtype(rhs_dtype) + and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype + ): + # OK if categories are not all the same + return True elif ( is_categorical_dtype(lhs_dtype) and not is_categorical_dtype(rhs_dtype) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6180162ecdd..61c3c4deffd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10389,6 +10389,19 @@ def test_dataframe_init_from_nested_dict(): assert_eq(pdf, gdf) +def test_init_from_2_categoricalindex_series_diff_categories(): + s1 = cudf.Series( + [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) + ) + s2 = cudf.Series( + [2, 152, 2, 242, 150], + index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), + ) + result = cudf.DataFrame([s1, s2]) + expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) + assert_eq(result, expected) + + def test_data_frame_values_no_cols_but_index(): result = cudf.DataFrame(index=range(5)).values expected = pd.DataFrame(index=range(5)).values From a5bb2594a45127dccb4faf96c50bcdb2d58e0fcb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:29:37 -0700 Subject: [PATCH 2/2] check_dtype=False --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6dce2990851..67b63028fab 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10418,7 +10418,7 @@ def test_init_from_2_categoricalindex_series_diff_categories(): ) result = cudf.DataFrame([s1, s2]) expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) - assert_eq(result, expected) + assert_eq(result, expected, check_dtype=False) def test_data_frame_values_no_cols_but_index():