From fa8284ddb2de808573d5b21cc9e650578ddf6acc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 27 Jun 2024 11:51:41 +0100 Subject: [PATCH] Adapt to polars upstream changes and turn on CI testing (#16081) They changed the semantics of join keys when those keys are expressions to more closely match SQL. Dtype inference is also tighter, so update tests to adapt to those changes, and some other small deprecation warnings. Finish the final missing coverage piece and turn on testing in CI (failing if we don't hit 100% coverage as well). Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16081 --- .github/workflows/pr.yaml | 12 +++ ci/test_cudf_polars.sh | 68 +++++++++++++++++ python/cudf_polars/cudf_polars/dsl/ir.py | 70 +++++++++--------- .../cudf_polars/typing/__init__.py | 74 ++++++++++--------- .../cudf_polars/tests/expressions/test_agg.py | 4 +- .../tests/expressions/test_booleanfunction.py | 12 ++- .../tests/expressions/test_rolling.py | 12 ++- .../tests/expressions/test_stringfunction.py | 16 ++-- python/cudf_polars/tests/test_groupby.py | 11 +-- python/cudf_polars/tests/test_join.py | 16 +++- python/cudf_polars/tests/test_mapfunction.py | 32 ++++++-- python/cudf_polars/tests/test_python_scan.py | 7 +- python/cudf_polars/tests/test_union.py | 12 +-- 13 files changed, 234 insertions(+), 112 deletions(-) create mode 100755 ci/test_cudf_polars.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cb582df21e0..a35802f2ab0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,6 +25,7 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf + - test-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -132,6 +133,17 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh + test-cudf-polars: + needs: wheel-build-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh new file mode 100755 index 00000000000..669e049ab26 --- /dev/null +++ b/ci/test_cudf_polars.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? 
+if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 +else + HAS_CHANGES=0 +fi + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" + +rapids-logger "Install cudf wheel" +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/cudf*.whl)[test] + +rapids-logger "Install polars (allow pre-release versions)" +python -m pip install 'polars>=1.0.0a0' + +rapids-logger "Install cudf_polars" +python -m pip install --no-deps python/cudf_polars + +rapids-logger "Run cudf_polars tests" + +function set_exitcode() +{ + EXITCODE=$? +} +EXITCODE=0 +trap set_exitcode ERR +set +e + +python -m pytest \ + --cache-clear \ + --cov cudf_polars \ + --cov-fail-under=100 \ + --cov-config=python/cudf_polars/pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ + python/cudf_polars/tests + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Testing FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Testing PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 4ad6e75fb2e..3f5f3c74050 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -123,7 +123,7 @@ def broadcast( ] -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class IR: """Abstract plan node, representing an unevaluated dataframe.""" @@ -157,7 +157,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" @@ -171,7 +171,7 @@ def __post_init__(self): raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Scan(IR): """Input from files.""" @@ -248,7 +248,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -269,7 +269,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return cache.setdefault(self.key, self.value.evaluate(cache=cache)) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -315,7 +315,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" @@ -336,7 +336,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. 
@@ -389,7 +389,7 @@ def placeholder_column(n: int) -> plc.Column: ) -@dataclasses.dataclass(slots=False) +@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -490,7 +490,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame([*result_keys, *results]).slice(self.options.slice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" @@ -518,8 +518,16 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - @cache + def __post_init__(self) -> None: + """Validate preconditions.""" + if any( + isinstance(e.value, expr.Literal) + for e in itertools.chain(self.left_on, self.right_on) + ): + raise NotImplementedError("Join with literal as join key.") + @staticmethod + @cache def _joiners( how: Literal["inner", "left", "full", "leftsemi", "leftanti"], ) -> tuple[ @@ -582,17 +590,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for new, old in zip(columns[left.num_columns :], right.columns) ] return DataFrame([*left_cols, *right_cols]) - left_on = DataFrame( - broadcast( - *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows - ) - ) - right_on = DataFrame( - broadcast( - *(e.evaluate(right) for e in self.right_on), - target_length=right.num_rows, - ) - ) + # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 + left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) + right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) null_equality = ( plc.types.NullEquality.EQUAL if join_nulls @@ -602,13 +602,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: if right_policy is None: # Semi join lg = join_fn(left_on.table, right_on.table, null_equality) - left = left.replace_columns(*left_on.columns) table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) - left = left.replace_columns(*left_on.columns) - right = right.replace_columns(*right_on.columns) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( @@ -642,7 +639,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" @@ -671,7 +668,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" @@ -741,7 +738,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" @@ -810,7 +807,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" @@ -827,7 +824,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.slice((self.offset, self.length)) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Filter(IR): """Filter a dataframe with a boolean mask.""" @@ -843,7 +840,7 
@@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" @@ -860,7 +857,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" @@ -894,6 +891,13 @@ def __post_init__(self) -> None: # polars requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") + elif self.name == "rename": + old, new, _ = self.options + # TODO: perhaps polars should validate renaming in the IR? + if len(new) != len(set(new)) or ( + set(new) & (set(self.df.schema.keys() - set(old))) + ): + raise NotImplementedError("Duplicate new names in rename.") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -919,7 +923,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" @@ -943,7 +947,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ).slice(self.zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 6d597a91724..c04eac41bb7 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections.abc import Mapping -from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias +from typing import TYPE_CHECKING, Literal, Protocol, Union from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir @@ -15,43 +15,45 @@ if TYPE_CHECKING: from typing import Callable + from typing_extensions import TypeAlias + import polars as pl -IR: TypeAlias = ( - pl_ir.PythonScan - | pl_ir.Scan - | pl_ir.Cache - | pl_ir.DataFrameScan - | pl_ir.Select - | pl_ir.GroupBy - | pl_ir.Join - | pl_ir.HStack - | pl_ir.Distinct - | pl_ir.Sort - | pl_ir.Slice - | pl_ir.Filter - | pl_ir.SimpleProjection - | pl_ir.MapFunction - | pl_ir.Union - | pl_ir.HConcat - | pl_ir.ExtContext -) - -Expr: TypeAlias = ( - pl_expr.Function - | pl_expr.Window - | pl_expr.Literal - | pl_expr.Sort - | pl_expr.SortBy - | pl_expr.Gather - | pl_expr.Filter - | pl_expr.Cast - | pl_expr.Column - | pl_expr.Agg - | pl_expr.BinaryExpr - | pl_expr.Len - | pl_expr.PyExprIR -) +IR: TypeAlias = Union[ + pl_ir.PythonScan, + pl_ir.Scan, + pl_ir.Cache, + pl_ir.DataFrameScan, + pl_ir.Select, + pl_ir.GroupBy, + pl_ir.Join, + pl_ir.HStack, + pl_ir.Distinct, + pl_ir.Sort, + pl_ir.Slice, + pl_ir.Filter, + pl_ir.SimpleProjection, + pl_ir.MapFunction, + pl_ir.Union, + pl_ir.HConcat, + pl_ir.ExtContext, +] + +Expr: TypeAlias = Union[ + pl_expr.Function, + pl_expr.Window, + pl_expr.Literal, + pl_expr.Sort, + pl_expr.SortBy, + pl_expr.Gather, + pl_expr.Filter, + pl_expr.Cast, + pl_expr.Column, + pl_expr.Agg, + pl_expr.BinaryExpr, + pl_expr.Len, + pl_expr.PyExprIR, +] Schema: TypeAlias = Mapping[str, plc.DataType] diff --git a/python/cudf_polars/tests/expressions/test_agg.py 
b/python/cudf_polars/tests/expressions/test_agg.py index 2ffa1c4af6d..267d0a99692 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -52,7 +52,7 @@ def test_agg(df, agg): # https://github.com/rapidsai/cudf/issues/15852 check_dtypes = agg not in {"n_unique", "median"} - if not check_dtypes and q.schema["a"] != pl.Float64: + if not check_dtypes and q.collect_schema()["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) @@ -65,7 +65,7 @@ def test_agg(df, agg): ) @pytest.mark.parametrize("op", ["min", "max"]) def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": [1, 2, float("nan")]}) + df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) q = df.select(op(pl.col("a"))) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 951b749e670..a52fba26528 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -26,7 +26,7 @@ def has_nulls(request): def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { - "a": [1, 2, 3.0, 2, 5], + "a": pl.Series([1, 2, 3.0, 2, 5], dtype=pl.Float64()), "b": [0, 3, 1, -1, None], "c": [1, 6, 5, 3, 2], } @@ -82,7 +82,9 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls): ], ) def test_unsupported_boolean_function(expr): - df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]}) + df = pl.LazyFrame( + {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]} + ) q = df.select(expr) @@ -95,7 +97,11 @@ def test_unsupported_boolean_function(expr): ) def test_boolean_isbetween(closed, bounds): df = pl.LazyFrame( - {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]} + { + "a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float32()), + "lo": [1, 2, 2, 3], + "hi": [10, 4, 2, 4], + } ) q = df.select(pl.col("a").is_between(*bounds, closed=closed)) diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py index d4920d35f14..992efe0ba79 100644 --- a/python/cudf_polars/tests/expressions/test_rolling.py +++ b/python/cudf_polars/tests/expressions/test_rolling.py @@ -3,11 +3,9 @@ from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_ir_translation_raises def test_rolling(): @@ -29,13 +27,13 @@ def test_rolling(): min_a=pl.min("a").rolling(index_column="dt", period="2d"), max_a=pl.max("a").rolling(index_column="dt", period="2d"), ) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + + assert_ir_translation_raises(q, NotImplementedError) def test_grouped_rolling(): df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]}) q = df.select(pl.col("a").min().over("b")) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 3c498fe7286..9729e765948 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ 
b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -8,8 +8,11 @@ import polars as pl -from cudf_polars import execute_with_cudf, translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars import execute_with_cudf +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture @@ -47,22 +50,19 @@ def test_supported_stringfunction_expression(ldf): def test_unsupported_stringfunction(ldf): q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_contains_re_non_strict_raises(ldf): q = ldf.select(pl.col("a").str.contains(".", strict=False)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_contains_re_non_literal_raises(ldf): q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index e70f923b097..aefad59eb91 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -6,8 +6,10 @@ import polars as pl -from cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture @@ -72,7 +74,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs) if not maintain_order: - sort_keys = list(q.schema.keys())[: len(keys)] + sort_keys = list(q.collect_schema().keys())[: len(keys)] q = q.sort(*sort_keys) assert_gpu_result_equal(q, check_exact=False) @@ -97,5 +99,4 @@ def test_groupby_len(df, keys): def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 81166b0b2f6..89f6fd3455b 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -71,3 +74,14 @@ def test_cross_join(): q = left.join(right, how="cross") assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] +) +def test_join_literal_key_unsupported(left_on, right_on): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]}) + q = left.join(right, left_on=left_on, right_on=right_on, how="inner") + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index ec6b3f3fc0a..77032108e6f 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -6,8 +6,10 @@ import polars as pl -from 
cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) def test_merge_sorted_raises(): @@ -17,16 +19,14 @@ def test_merge_sorted_raises(): q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a") - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_explode_multiple_raises(): df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) q = df.explode("a", "b") - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize("column", ["a", "b"]) @@ -41,3 +41,23 @@ def test_explode_single(column): q = df.explode(column) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("mapping", [{"b": "a"}, {"a": "c", "b": "c"}]) +def test_rename_duplicate_raises(mapping): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + + q = df.rename(mapping) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "mapping", [{}, {"b": "c"}, {"b": "a", "a": "b"}, {"a": "c", "b": "d"}] +) +def test_rename_columns(mapping): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + + q = df.rename(mapping) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index c03474e3dc8..fd8453b77c4 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -2,11 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_ir_translation_raises def test_python_scan(): @@ -14,7 +12,6 @@ def source(with_columns, predicate, nrows): return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) assert q.collect().equals(source(None, None, None)) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 6c9122bc260..b021d832910 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,12 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) def test_union(): @@ -31,8 +31,8 @@ def test_union_schema_mismatch_raises(): ).lazy() ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) query = pl.concat([ldf, ldf2], how="diagonal") - with pytest.raises(NotImplementedError): - _ = translate_ir(query._ldf.visit()) + + assert_ir_translation_raises(query, NotImplementedError) def test_concat_vertical():
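
Note on the test changes above: the repeated pattern `with pytest.raises(NotImplementedError): _ = translate_ir(q._ldf.visit())` is replaced throughout by `assert_ir_translation_raises` from `cudf_polars.testing.asserts`. That helper's implementation is not part of this diff; the sketch below shows roughly what it needs to do. The signature and error reporting here are assumptions inferred from how the tests call it (`assert_ir_translation_raises(q, NotImplementedError)`), and the real helper in `cudf_polars.testing.asserts` may differ.

    # Sketch only: the real helper lives in cudf_polars.testing.asserts and
    # may differ in signature and error messages.
    from __future__ import annotations

    import polars as pl

    from cudf_polars import translate_ir


    def assert_ir_translation_raises(
        q: pl.LazyFrame, *exceptions: type[Exception]
    ) -> None:
        """Assert that translating ``q``'s IR raises one of ``exceptions``."""
        try:
            _ = translate_ir(q._ldf.visit())
        except exceptions:
            # Translation failed with an expected exception, i.e. cudf_polars
            # correctly reports the query as unsupported.
            return
        else:
            raise AssertionError(f"Translation did not raise {exceptions}")

With a helper of this shape, a test such as test_union_schema_mismatch_raises reduces to the single call `assert_ir_translation_raises(query, NotImplementedError)`, as shown in the diff above.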