diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index 836c3e65..fbb892d5 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -9,7 +9,7 @@ jobs: tox: strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] os: [windows-latest, ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/dataframe_api_compat/pandas_standard/__init__.py b/dataframe_api_compat/pandas_standard/__init__.py index c82b3037..d6f619c8 100644 --- a/dataframe_api_compat/pandas_standard/__init__.py +++ b/dataframe_api_compat/pandas_standard/__init__.py @@ -1,6 +1,8 @@ from __future__ import annotations +import re from typing import Any +from typing import Literal from typing import TYPE_CHECKING import pandas as pd @@ -10,12 +12,24 @@ from dataframe_api_compat.pandas_standard.pandas_standard import PandasColumn from dataframe_api_compat.pandas_standard.pandas_standard import PandasDataFrame from dataframe_api_compat.pandas_standard.pandas_standard import PandasGroupBy +from dataframe_api_compat.pandas_standard.pandas_standard import PandasPermissiveColumn +from dataframe_api_compat.pandas_standard.pandas_standard import PandasPermissiveFrame if TYPE_CHECKING: from collections.abc import Sequence + from dataframe_api._types import DType + + +def col(name: str) -> PandasColumn: + return PandasColumn( + root_names=[name], output_name=name, base_call=lambda df: df.loc[:, name] + ) + Column = PandasColumn +PermissiveColumn = PandasPermissiveColumn DataFrame = PandasDataFrame +PermissiveFrame = PandasPermissiveFrame GroupBy = PandasGroupBy @@ -67,35 +81,82 @@ class String: ... -DTYPE_MAP = { - "int64": Int64(), - "Int64": Int64(), - "int32": Int32(), - "Int32": Int32(), - "int16": Int16(), - "Int16": Int16(), - "int8": Int8(), - "Int8": Int8(), - "uint64": UInt64(), - "UInt64": UInt64(), - "uint32": UInt32(), - "UInt32": UInt32(), - "uint16": UInt16(), - "UInt16": UInt16(), - "uint8": UInt8(), - "UInt8": UInt8(), - "float64": Float64(), - "Float64": Float64(), - "float32": Float32(), - "Float32": Float32(), - "bool": Bool(), - "boolean": Bool(), - "object": String(), - "string": String(), -} - - -def map_standard_dtype_to_pandas_dtype(dtype: Any) -> Any: +class Date: + ... + + +class Datetime: + def __init__(self, time_unit, time_zone=None): + self.time_unit = time_unit + # todo validate time zone + self.time_zone = time_zone + + +class Duration: + def __init__(self, time_unit): + self.time_unit = time_unit + + +def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType: + if dtype == "int64": + return Int64() + if dtype == "Int64": + return Int64() + if dtype == "int32": + return Int32() + if dtype == "Int32": + return Int32() + if dtype == "int16": + return Int16() + if dtype == "Int16": + return Int16() + if dtype == "int8": + return Int8() + if dtype == "Int8": + return Int8() + if dtype == "uint64": + return UInt64() + if dtype == "UInt64": + return UInt64() + if dtype == "uint32": + return UInt32() + if dtype == "UInt32": + return UInt32() + if dtype == "uint16": + return UInt16() + if dtype == "UInt16": + return UInt16() + if dtype == "uint8": + return UInt8() + if dtype == "UInt8": + return UInt8() + if dtype == "float64": + return Float64() + if dtype == "Float64": + return Float64() + if dtype == "float32": + return Float32() + if dtype == "Float32": + return Float32() + if dtype == "bool": + # 'boolean' not yet covered, as the default dtype in pandas is still 'bool' + return Bool() + if dtype == "object": + return String() + if dtype == "string": + return String() + if dtype == "datetime64[s]": + return Date() + if dtype.startswith("datetime64["): + time_unit = re.search(r"datetime64\[(\w{1,2})", dtype).group(1) + return Datetime(time_unit) + if dtype.startswith("timedelta64["): + time_unit = re.search(r"timedelta64\[(\w{1,2})", dtype).group(1) + return Duration(time_unit) + raise AssertionError(f"Unsupported dtype! {dtype}") + + +def map_standard_dtype_to_pandas_dtype(dtype: DType) -> Any: if isinstance(dtype, Int64): return "int64" if isinstance(dtype, Int32): @@ -120,9 +181,26 @@ def map_standard_dtype_to_pandas_dtype(dtype: Any) -> Any: return "bool" if isinstance(dtype, String): return "object" + if isinstance(dtype, Datetime): + if dtype.time_zone is not None: # pragma: no cover (todo) + return f"datetime64[{dtype.time_unit}, {dtype.time_zone}]" + return f"datetime64[{dtype.time_unit}]" + if isinstance(dtype, Duration): + return f"timedelta64[{dtype.time_unit}]" raise AssertionError(f"Unknown dtype: {dtype}") +def convert_to_standard_compliant_column( + ser: pd.Series, api_version: str | None = None +) -> PandasDataFrame: + if api_version is None: # pragma: no cover + api_version = LATEST_API_VERSION + if ser.name is not None and not isinstance(ser.name, str): + raise ValueError(f"Expected column with string name, got: {ser.name}") + name = ser.name or "" + return PandasPermissiveColumn(ser.rename(name), api_version=api_version) + + def convert_to_standard_compliant_dataframe( df: pd.DataFrame, api_version: str | None = None ) -> PandasDataFrame: @@ -131,13 +209,6 @@ def convert_to_standard_compliant_dataframe( return PandasDataFrame(df, api_version=api_version) -def convert_to_standard_compliant_column( - df: pd.Series[Any], - api_version: str | None = None, -) -> PandasColumn[Any]: - return PandasColumn(df, api_version=api_version or LATEST_API_VERSION) - - def concat(dataframes: Sequence[PandasDataFrame]) -> PandasDataFrame: dtypes = dataframes[0].dataframe.dtypes dfs = [] @@ -164,16 +235,30 @@ def concat(dataframes: Sequence[PandasDataFrame]) -> PandasDataFrame: def column_from_sequence( sequence: Sequence[Any], *, dtype: Any, name: str, api_version: str | None = None -) -> PandasColumn[Any]: +) -> PandasPermissiveColumn[Any]: ser = pd.Series(sequence, dtype=map_standard_dtype_to_pandas_dtype(dtype), name=name) - return PandasColumn(ser, api_version=LATEST_API_VERSION) + return PandasPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION) + + +def dataframe_from_dict( + data: dict[str, PandasPermissiveColumn[Any]], api_version: str | None = None +) -> PandasDataFrame: + for _, col in data.items(): + if not isinstance(col, PandasPermissiveColumn): # pragma: no cover + raise TypeError(f"Expected PandasPermissiveColumn, got {type(col)}") + return PandasDataFrame( + pd.DataFrame( + {label: column.column.rename(label) for label, column in data.items()} + ), + api_version=api_version or LATEST_API_VERSION, + ) def column_from_1d_array( data: Any, *, dtype: Any, name: str | None = None, api_version: str | None = None -) -> PandasColumn[Any]: # pragma: no cover +) -> PandasPermissiveColumn[Any]: # pragma: no cover ser = pd.Series(data, dtype=map_standard_dtype_to_pandas_dtype(dtype), name=name) - return PandasColumn(ser, api_version=api_version or LATEST_API_VERSION) + return PandasPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION) def dataframe_from_2d_array( @@ -189,20 +274,6 @@ def dataframe_from_2d_array( return PandasDataFrame(df, api_version=api_version or LATEST_API_VERSION) -def dataframe_from_dict( - data: dict[str, PandasColumn[Any]], api_version: str | None = None -) -> PandasDataFrame: - for _, col in data.items(): - if not isinstance(col, PandasColumn): # pragma: no cover - raise TypeError(f"Expected PandasColumn, got {type(col)}") - return PandasDataFrame( - pd.DataFrame( - {label: column.column.rename(label) for label, column in data.items()} - ), - api_version=api_version or LATEST_API_VERSION, - ) - - def is_null(value: Any) -> bool: return value is null @@ -223,3 +294,47 @@ def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool: if _kind == "string": dtypes.add(String) return isinstance(dtype, tuple(dtypes)) + + +def any_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn: + # todo: accept expressions + def func(df): + return df.loc[:, list(columns) or df.columns.tolist()].any(axis=1) + + return PandasColumn(root_names=list(columns), output_name="any", base_call=func) + + +def all_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn: + def func(df: pd.DataFrame) -> pd.Series: + return df.loc[:, list(columns) or df.columns.tolist()].all(axis=1) + + return PandasColumn(root_names=list(columns), output_name="all", base_call=func) + + +def sorted_indices( + *keys: str, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal["first", "last"] = "last", +) -> Column: + def func(df: pd.DataFrame) -> pd.Series: + if ascending: + return ( + df.loc[:, list(keys)] + .sort_values(list(keys)) + .index.to_series() + .reset_index(drop=True) + ) + return ( + df.loc[:, list(keys)] + .sort_values(list(keys)) + .index.to_series()[::-1] + .reset_index(drop=True) + ) + + return PandasColumn(root_names=list(keys), output_name="indices", base_call=func) + + +def unique_indices( + keys: str | list[str] | None = None, *, skip_nulls: bool = True +) -> Column: + raise NotImplementedError("namespace.unique_indices not implemented for pandas yet") diff --git a/dataframe_api_compat/pandas_standard/pandas_standard.py b/dataframe_api_compat/pandas_standard/pandas_standard.py index 3a45523f..6dbdfcde 100644 --- a/dataframe_api_compat/pandas_standard/pandas_standard.py +++ b/dataframe_api_compat/pandas_standard/pandas_standard.py @@ -2,6 +2,7 @@ import collections from typing import Any +from typing import Callable from typing import cast from typing import Generic from typing import Literal @@ -45,319 +46,372 @@ class Null: from dataframe_api import ( Bool, + PermissiveColumn, Column, DataFrame, + PermissiveFrame, GroupBy, ) + + ExtraCall = tuple[ + Callable[[pd.Series, pd.Series | None], pd.Series], pd.Series, pd.Series + ] + else: - class DataFrame(Generic[DType]): + class DataFrame: + ... + + class PermissiveFrame: ... - class Column(Generic[DType]): + class PermissiveColumn(Generic[DType]): + ... + + class Column: ... class GroupBy: ... + class Bool: + ... -class PandasColumn(Column[DType]): - # private, not technically part of the standard - def __init__(self, column: pd.Series[Any], api_version: str) -> None: - if ( - isinstance(column.index, pd.RangeIndex) - and column.index.start == 0 # type: ignore[comparison-overlap] - and column.index.step == 1 # type: ignore[comparison-overlap] - and (column.index.stop == len(column)) # type: ignore[comparison-overlap] - ): - self._series = column - else: - self._series = column.reset_index(drop=True) - self._api_version = api_version - if api_version not in SUPPORTED_VERSIONS: - raise ValueError( - "Unsupported API version, expected one of: " - f"{SUPPORTED_VERSIONS}. " - "Try updating dataframe-api-compat?" - ) - def _validate_index(self, index: pd.Index) -> None: - pd.testing.assert_index_equal(self.column.index, index) +class PandasColumn(Column): + def __init__( + self, + root_names: list[str] | None, + output_name: str, + base_call: Callable[[pd.DataFrame], pd.Series] | None = None, + extra_calls: list[ExtraCall] | None = None, + *, + api_version: str | None = None, # todo: propagate + ) -> None: + """ + Parameters + ---------- + root_names + Columns from DataFrame to consider as inputs to expression. + If `None`, all input columns are considered. + output_name + Name of resulting column. + base_call + Call to be applied to DataFrame. Should return a Series. + extra_calls + Extra calls to chain to output of `base_call`. Must take Series + and output Series. + """ + self._base_call = base_call + self._calls = extra_calls or [] + self._root_names = root_names + self._output_name = output_name + # TODO: keep track of output name # In the standard def __column_namespace__(self) -> Any: return dataframe_api_compat.pandas_standard @property - def name(self) -> str: - return self.column.name # type: ignore[return-value] + def root_names(self): + return sorted(set(self._root_names)) @property - def column(self) -> pd.Series[Any]: - return self._series + def output_name(self): + return self._output_name - def __len__(self) -> int: - return len(self.column) - - def __iter__(self) -> NoReturn: - raise NotImplementedError() + def _record_call( + self, + func: Callable[[pd.Series, pd.Series | None], pd.Series], + rhs: pd.Series | None, + output_name: str | None = None, + ) -> PandasColumn: + calls = [*self._calls, (func, self, rhs)] + if isinstance(rhs, PandasColumn): + root_names = self.root_names + rhs.root_names + else: + root_names = self.root_names + return PandasColumn( + root_names=root_names, + output_name=output_name or self.output_name, + extra_calls=calls, + ) - @property - def dtype(self) -> Any: - return dataframe_api_compat.pandas_standard.DTYPE_MAP[self.column.dtype.name] + def get_rows(self, indices: Column | PermissiveColumn[Any]) -> PandasColumn: + def func(lhs: pd.Series, rhs: pd.Series) -> pd.Series: + return lhs.iloc[rhs].reset_index(drop=True) - def get_rows(self, indices: Column[Any]) -> PandasColumn[DType]: - return PandasColumn( - self.column.iloc[indices.column.to_numpy()], api_version=self._api_version + return self._record_call( + func, + indices, ) def slice_rows( self, start: int | None, stop: int | None, step: int | None ) -> PandasColumn[DType]: - if start is None: - start = 0 - if stop is None: - stop = len(self.column) - if step is None: - step = 1 - return PandasColumn( - self.column.iloc[start:stop:step], api_version=self._api_version + def func(ser, _rhs, start, stop, step): + if start is None: + start = 0 + if stop is None: + stop = len(ser) + if step is None: + step = 1 + return ser.iloc[start:stop:step] + + import functools + + return self._record_call( + functools.partial(func, start=start, stop=stop, step=step), None + ) + + def len(self) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: pd.Series([len(ser)], name=ser.name), None ) - def filter(self, mask: Column[Bool]) -> PandasColumn[DType]: - series = mask.column - self._validate_index(series.index) - return PandasColumn(self.column.loc[series], api_version=self._api_version) + def filter(self, mask: Column | PermissiveColumn[Any]) -> PandasColumn: + return self._record_call(lambda ser, mask: ser.loc[mask], mask) def get_value(self, row: int) -> Any: - return self.column.iloc[row] + return self._record_call(lambda ser, _rhs: ser.iloc[[row]], None) - def __eq__( # type: ignore[override] - self, other: PandasColumn[DType] | Any - ) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column == other.column, api_version=self._api_version - ) - return PandasColumn(self.column == other, api_version=self._api_version) + def __eq__(self, other: PandasColumn | Any) -> PandasColumn: # type: ignore[override] + return self._record_call( + lambda ser, other: (ser == other).rename(ser.name), other + ) - def __ne__( # type: ignore[override] - self, other: Column[DType] - ) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column != other.column, api_version=self._api_version - ) - return PandasColumn(self.column != other, api_version=self._api_version) + def __ne__(self, other: Column | PermissiveColumn[Any]) -> PandasColumn: # type: ignore[override] + return self._record_call( + lambda ser, other: (ser != other).rename(ser.name), other + ) - def __ge__(self, other: Column[DType] | Any) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column >= other.column, api_version=self._api_version - ) - return PandasColumn(self.column >= other, api_version=self._api_version) + def __ge__(self, other: Column | Any) -> PandasColumn: + return self._record_call( + lambda ser, other: (ser >= other).rename(ser.name), other + ) - def __gt__(self, other: Column[DType] | Any) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column > other.column, api_version=self._api_version) - return PandasColumn(self.column > other, api_version=self._api_version) + def __gt__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser > other).rename(ser.name), other) - def __le__(self, other: Column[DType] | Any) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column <= other.column, api_version=self._api_version - ) - return PandasColumn(self.column <= other, api_version=self._api_version) - - def __lt__(self, other: Column[DType] | Any) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column < other.column, api_version=self._api_version) - return PandasColumn(self.column < other, api_version=self._api_version) - - def __and__(self, other: Column[Bool] | bool) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column & other.column, api_version=self._api_version) - result = self.column & other # type: ignore[operator] - return PandasColumn(result, api_version=self._api_version) - - def __or__(self, other: Column[Bool] | bool) -> PandasColumn[Bool]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column | other.column, api_version=self._api_version) - return PandasColumn(self.column | other, api_version=self._api_version) # type: ignore[operator] - - def __add__(self, other: Column[DType] | Any) -> PandasColumn[DType]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column + other.column, api_version=self._api_version) - return PandasColumn(self.column + other, api_version=self._api_version) # type: ignore[operator] - - def __sub__(self, other: Column[DType] | Any) -> PandasColumn[DType]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column - other.column, api_version=self._api_version) - return PandasColumn(self.column - other, api_version=self._api_version) # type: ignore[operator] - - def __mul__(self, other: Column[DType] | Any) -> PandasColumn[Any]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column * other.column, api_version=self._api_version) - return PandasColumn(self.column * other, api_version=self._api_version) # type: ignore[operator] - - def __truediv__(self, other: Column[DType] | Any) -> PandasColumn[Any]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column / other.column, api_version=self._api_version) - return PandasColumn(self.column / other, api_version=self._api_version) # type: ignore[operator] - - def __floordiv__(self, other: Column[DType] | Any) -> PandasColumn[Any]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column // other.column, api_version=self._api_version - ) - return PandasColumn(self.column // other, api_version=self._api_version) # type: ignore[operator] + def __le__(self, other: Column | Any) -> PandasColumn: + return self._record_call( + lambda ser, other: (ser <= other).rename(ser.name), other + ) - def __pow__(self, other: Column[DType] | Any) -> PandasColumn[Any]: - if isinstance(other, PandasColumn): - return PandasColumn( - self.column**other.column, api_version=self._api_version - ) - return PandasColumn(self.column**other, api_version=self._api_version) # type: ignore[operator] + def __lt__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser < other).rename(ser.name), other) - def __mod__(self, other: Column[DType] | Any) -> PandasColumn[Any]: - if isinstance(other, PandasColumn): - return PandasColumn(self.column % other.column, api_version=self._api_version) - return PandasColumn(self.column % other, api_version=self._api_version) # type: ignore[operator] + def __and__(self, other: Column | bool) -> PandasColumn: + return self._record_call(lambda ser, other: (ser & other).rename(ser.name), other) - def __divmod__( - self, other: Column[DType] | Any - ) -> tuple[PandasColumn[Any], PandasColumn[Any]]: - if isinstance(other, PandasColumn): - quotient, remainder = self.column.__divmod__(other.column) - else: - quotient, remainder = self.column.__divmod__(other) - return PandasColumn(quotient, api_version=self._api_version), PandasColumn( - remainder, api_version=self._api_version + def __or__(self, other: Column | bool) -> PandasColumn: + return self._record_call(lambda ser, other: (ser | other).rename(ser.name), other) + + def __add__(self, other: Column | Any) -> PandasColumn: + return self._record_call( + lambda ser, other: ((ser + other).rename(ser.name)).rename(ser.name), other ) - def __invert__(self: PandasColumn[Bool]) -> PandasColumn[Bool]: - return PandasColumn(~self.column, api_version=self._api_version) + def __sub__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser - other).rename(ser.name), other) - def any(self, *, skip_nulls: bool = True) -> bool: - return self.column.any() + def __mul__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser * other).rename(ser.name), other) - def all(self, *, skip_nulls: bool = True) -> bool: - return self.column.all() + def __truediv__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser / other).rename(ser.name), other) + + def __floordiv__(self, other: Column | Any) -> PandasColumn: + return self._record_call( + lambda ser, other: (ser // other).rename(ser.name), other + ) + + def __pow__(self, other: Column | Any) -> PandasColumn: + return self._record_call( + lambda ser, other: (ser**other).rename(ser.name), other + ) + + def __mod__(self, other: Column | Any) -> PandasColumn: + return self._record_call(lambda ser, other: (ser % other).rename(ser.name), other) + + def __divmod__(self, other: Column | Any) -> tuple[PandasColumn, PandasColumn]: + quotient = self // other + remainder = self - quotient * other + return quotient, remainder + + def __invert__(self: PandasColumn) -> PandasColumn: + return self._record_call(lambda ser, _rhs: ~ser, None) + + # Reductions + + def any(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: pd.Series([ser.any()], name=ser.name), None + ) + + def all(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: pd.Series([ser.all()], name=ser.name), None + ) def min(self, *, skip_nulls: bool = True) -> Any: - return self.column.min() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.min()], name=ser.name), None + ) def max(self, *, skip_nulls: bool = True) -> Any: - return self.column.max() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.max()], name=ser.name), None + ) def sum(self, *, skip_nulls: bool = True) -> Any: - return self.column.sum() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.sum()], name=ser.name), None + ) def prod(self, *, skip_nulls: bool = True) -> Any: - return self.column.prod() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.prod()], name=ser.name), None + ) def median(self, *, skip_nulls: bool = True) -> Any: - return self.column.median() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.median()], name=ser.name), None + ) def mean(self, *, skip_nulls: bool = True) -> Any: - return self.column.mean() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.mean()], name=ser.name), None + ) def std(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: - return self.column.std() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.std()], name=ser.name), None + ) def var(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: - return self.column.var() + return self._record_call( + lambda ser, _rhs: pd.Series([ser.var()], name=ser.name), None + ) - def is_null(self) -> PandasColumn[Bool]: - return PandasColumn(self.column.isna(), api_version=self._api_version) + def is_null(self) -> PandasColumn: + return self._record_call(lambda ser, _rhs: ser.isna(), None) - def is_nan(self) -> PandasColumn[Bool]: - if is_extension_array_dtype(self.column.dtype): - return PandasColumn( - np.isnan(self.column).replace(pd.NA, False).astype(bool), - api_version=self._api_version, - ) - return PandasColumn(self.column.isna(), api_version=self._api_version) + def is_nan(self) -> PandasColumn: + def func(ser, _rhs): + if is_extension_array_dtype(ser.dtype): + return np.isnan(ser).replace(pd.NA, False).astype(bool) + return ser.isna() - def sorted_indices( - self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" - ) -> PandasColumn[Any]: - if ascending: - return PandasColumn( - pd.Series(self.column.argsort()), api_version=self._api_version - ) - return PandasColumn( - pd.Series(self.column.argsort()[::-1]), api_version=self._api_version - ) + return self._record_call(func, None) def sort( self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" - ) -> PandasColumn[Any]: - if self._api_version == "2023.08-beta": - raise NotImplementedError("dataframe.sort only available after 2023.08-beta") - return PandasColumn( - self.column.sort_values(ascending=ascending), api_version=self._api_version + ) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: ser.sort_values(ascending=ascending).reset_index(drop=True), + None, ) - def is_in(self, values: Column[DType]) -> PandasColumn[Bool]: - if values.dtype != self.dtype: - raise ValueError(f"`value` has dtype {values.dtype}, expected {self.dtype}") - return PandasColumn( - self.column.isin(values.column), api_version=self._api_version + def sorted_indices( + self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" + ) -> PandasColumn: + def func(ser, _rhs): + if ascending: + return ( + ser.sort_values() + .index.to_series(name=self.output_name) + .reset_index(drop=True) + ) + return ( + ser.sort_values() + .index.to_series(name=self.output_name)[::-1] + .reset_index(drop=True) + ) + + return self._record_call( + func, + None, ) - def unique_indices(self, *, skip_nulls: bool = True) -> PandasColumn[Any]: - return PandasColumn( - self.column.drop_duplicates().index.to_series(), api_version=self._api_version + def is_in(self, values: Column | PermissiveColumn[Any]) -> PandasColumn: + return self._record_call( + lambda ser, other: ser.isin(other), + values, ) + def unique_indices(self, *, skip_nulls: bool = True) -> PandasColumn: + raise NotImplementedError("not yet supported") + def fill_nan( self, value: float | pd.NAType # type: ignore[name-defined] - ) -> PandasColumn[DType]: - ser = self.column.copy() - ser[cast("pd.Series[bool]", np.isnan(ser)).fillna(False).to_numpy(bool)] = value - return PandasColumn(ser, api_version=self._api_version) + ) -> PandasColumn: + def func(ser, _rhs): + ser = ser.copy() + ser[ + cast("pd.Series[bool]", np.isnan(ser)).fillna(False).to_numpy(bool) + ] = value + return ser + + return self._record_call( + func, + None, + ) def fill_null( self, value: Any, - ) -> PandasColumn[DType]: - ser = self.column.copy() - if is_extension_array_dtype(ser.dtype): - # crazy hack to preserve nan... - num = pd.Series( - np.where(np.isnan(ser).fillna(False), 0, ser.fillna(value)), - dtype=ser.dtype, - ) - other = pd.Series( - np.where(np.isnan(ser).fillna(False), 0, 1), dtype=ser.dtype - ) - ser = num / other - else: - ser = ser.fillna(value) - return PandasColumn(pd.Series(ser), api_version=self._api_version) + ) -> PandasColumn: + def func(ser, value): + ser = ser.copy() + if is_extension_array_dtype(ser.dtype): + # crazy hack to preserve nan... + num = pd.Series( + np.where(np.isnan(ser).fillna(False), 0, ser.fillna(value)), + dtype=ser.dtype, + ) + other = pd.Series( + np.where(np.isnan(ser).fillna(False), 0, 1), dtype=ser.dtype + ) + ser = num / other + else: + ser = ser.fillna(value) + return ser.rename(self.output_name) - def cumulative_sum(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: - return PandasColumn(self.column.cumsum(), api_version=self._api_version) + return self._record_call( + lambda ser, _rhs: func(ser, value), + None, + ) - def cumulative_prod(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: - return PandasColumn(self.column.cumprod(), api_version=self._api_version) + def cumulative_sum(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: ser.cumsum(), + None, + ) - def cumulative_max(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: - return PandasColumn(self.column.cummax(), api_version=self._api_version) + def cumulative_prod(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: ser.cumprod(), + None, + ) - def cumulative_min(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: - return PandasColumn(self.column.cummin(), api_version=self._api_version) + def cumulative_max(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: ser.cummax(), + None, + ) - def to_array_object(self, dtype: str) -> Any: - if dtype not in _ARRAY_API_DTYPES: - raise ValueError( - f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" - ) - return self.column.to_numpy(dtype=dtype) + def cumulative_min(self, *, skip_nulls: bool = True) -> PandasColumn: + return self._record_call( + lambda ser, _rhs: ser.cummin(), + None, + ) - def rename(self, name: str | None) -> PandasColumn[DType]: - return PandasColumn(self.column.rename(name), api_version=self._api_version) + def rename(self, name: str) -> PandasColumn: + expr = self._record_call( + lambda ser, _rhs: ser.rename(name), None, output_name=name + ) + return expr class PandasGroupBy(GroupBy): @@ -451,6 +505,239 @@ def var( SUPPORTED_VERSIONS = frozenset((LATEST_API_VERSION, "2023.08-beta")) +class PandasPermissiveColumn(PermissiveColumn[DType]): + # private, not technically part of the standard + def __init__(self, column: pd.Series[Any], api_version: str) -> None: + self._name = column.name + self._series = column.reset_index(drop=True) + self._api_version = api_version + if api_version not in SUPPORTED_VERSIONS: + raise AssertionError( + "Unsupported API version, expected one of: " + f"{SUPPORTED_VERSIONS}. " + "Try updating dataframe-api-compat?" + ) + + def _to_expression(self) -> PandasColumn: + return PandasColumn( + root_names=[], + output_name=self.name, + base_call=lambda _df: self.column.rename(self.name), + ) + + def _reuse_expression_implementation(self, function_name, *args, **kwargs): + return ( + PandasDataFrame(pd.DataFrame(), api_version=self._api_version) + .select(getattr(self._to_expression(), function_name)(*args, **kwargs)) + .collect() + .get_column_by_name(self.name) + ) + + # In the standard + def __column_namespace__(self) -> Any: + return dataframe_api_compat.pandas_standard + + @property + def name(self) -> str: + return self._name + + @property + def column(self) -> pd.Series[Any]: + return self._series + + def len(self) -> int: + return len(self.column) + + def __iter__(self) -> NoReturn: + raise NotImplementedError() + + @property + def dtype(self) -> Any: + return dataframe_api_compat.pandas_standard.map_pandas_dtype_to_standard_dtype( + self.column.dtype.name + ) + + def get_rows(self, indices: PermissiveColumn[Any]) -> PandasColumn[DType]: + return self._reuse_expression_implementation("get_rows", indices) + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> PandasPermissiveColumn[DType]: + return self._reuse_expression_implementation( + "slice_rows", start=start, stop=stop, step=step + ) + + def filter(self, mask: Column | PermissiveColumn[Any]) -> PandasColumn[DType]: + return self._reuse_expression_implementation("filter", mask) + + def get_value(self, row: int) -> Any: + return self.column.iloc[row] + + def __eq__( # type: ignore[override] + self, other: PandasColumn[DType] | Any + ) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__eq__", other) + + def __ne__( # type: ignore[override] + self, other: PermissiveColumn[DType] + ) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__ne__", other) + + def __ge__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__ge__", other) + + def __gt__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__gt__", other) + + def __le__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__le__", other) + + def __lt__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__lt__", other) + + def __and__(self, other: PermissiveColumn[Bool] | bool) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__and__", other) + + def __or__(self, other: PermissiveColumn[Bool] | bool) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__or__", other) + + def __add__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[DType]: + return self._reuse_expression_implementation("__add__", other) + + def __sub__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[DType]: + return self._reuse_expression_implementation("__sub__", other) + + def __mul__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Any]: + return self._reuse_expression_implementation("__mul__", other) + + def __truediv__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Any]: + return self._reuse_expression_implementation("__truediv__", other) + + def __floordiv__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Any]: + return self._reuse_expression_implementation("__floordiv__", other) + + def __pow__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Any]: + return self._reuse_expression_implementation("__pow__", other) + + def __mod__(self, other: PermissiveColumn[DType] | Any) -> PandasColumn[Any]: + return self._reuse_expression_implementation("__mod__", other) + + def __divmod__( + self, other: PermissiveColumn[DType] | Any + ) -> tuple[PandasColumn[Any], PandasColumn[Any]]: + quotient = self // other + remainder = self - quotient * other + return quotient, remainder + + def __invert__(self: PandasColumn[Bool]) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("__invert__") + + # Reductions + # Can't reuse the expressions implementation here as these return scalars. + + def any(self, *, skip_nulls: bool = True) -> bool: + return self.column.any() + + def all(self, *, skip_nulls: bool = True) -> bool: + return self.column.all() + + def min(self, *, skip_nulls: bool = True) -> Any: + return self.column.min() + + def max(self, *, skip_nulls: bool = True) -> Any: + return self.column.max() + + def sum(self, *, skip_nulls: bool = True) -> Any: + return self.column.sum() + + def prod(self, *, skip_nulls: bool = True) -> Any: + return self.column.prod() + + def median(self, *, skip_nulls: bool = True) -> Any: + return self.column.median() + + def mean(self, *, skip_nulls: bool = True) -> Any: + return self.column.mean() + + def std(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: + return self.column.std() + + def var(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: + return self.column.var() + + # Transformations, defer to expressions impl + + def is_null(self) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("is_null") + + def is_nan(self) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("is_nan") + + def sorted_indices( + self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" + ) -> PandasColumn[Any]: + return self._reuse_expression_implementation( + "sorted_indices", ascending=ascending, nulls_position=nulls_position + ) + + def sort( + self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" + ) -> PandasColumn[Any]: + return self._reuse_expression_implementation( + "sort", ascending=ascending, nulls_position=nulls_position + ) + + def is_in(self, values: PermissiveColumn[DType]) -> PandasColumn[Bool]: + return self._reuse_expression_implementation("is_in", values) + + def unique_indices(self, *, skip_nulls: bool = True) -> PandasColumn[Any]: + raise NotImplementedError("not yet supported") + + def fill_nan( + self, value: float | pd.NAType # type: ignore[name-defined] + ) -> PandasColumn[DType]: + return self._reuse_expression_implementation("fill_nan", value) + + def fill_null( + self, + value: Any, + ) -> PandasColumn[DType]: + return self._reuse_expression_implementation("fill_null", value) + + def cumulative_sum(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: + return self._reuse_expression_implementation( + "cumulative_sum", skip_nulls=skip_nulls + ) + + def cumulative_prod(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: + return self._reuse_expression_implementation( + "cumulative_prod", skip_nulls=skip_nulls + ) + + def cumulative_max(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: + return self._reuse_expression_implementation( + "cumulative_max", skip_nulls=skip_nulls + ) + + def cumulative_min(self, *, skip_nulls: bool = True) -> PandasColumn[DType]: + return self._reuse_expression_implementation( + "cumulative_min", skip_nulls=skip_nulls + ) + + def rename(self, name: str) -> PandasColumn[DType]: + self._name = name + return self._reuse_expression_implementation("rename", name=name) + + # Eager-only + + def to_array_object(self, dtype: str) -> Any: + if dtype not in _ARRAY_API_DTYPES: + raise ValueError( + f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" + ) + return self.column.to_numpy(dtype=dtype) + + class PandasDataFrame(DataFrame): # Not technically part of the standard @@ -468,13 +755,25 @@ def __init__(self, dataframe: pd.DataFrame, api_version: str) -> None: else: self._dataframe = dataframe.reset_index(drop=True) if api_version not in SUPPORTED_VERSIONS: - raise ValueError( + raise AssertionError( "Unsupported API version, expected one of: " f"{SUPPORTED_VERSIONS}. Got: {api_version}" "Try updating dataframe-api-compat?" ) self._api_version = api_version + def __repr__(self) -> str: # pragma: no cover + return self.dataframe.__repr__() + + @property + def schema(self) -> dict[str, Any]: + return { + column_name: dataframe_api_compat.pandas_standard.map_pandas_dtype_to_standard_dtype( + dtype.name + ) + for column_name, dtype in self.dataframe.dtypes.items() + } + def _validate_columns(self, columns: Sequence[str]) -> None: counter = collections.Counter(columns) for col, count in counter.items(): @@ -489,9 +788,6 @@ def _validate_columns(self, columns: Sequence[str]) -> None: f"of type {type(col)}" ) - def _validate_index(self, index: pd.Index) -> None: - pd.testing.assert_index_equal(self.dataframe.index, index) - def _validate_booleanness(self) -> None: if not ( (self.dataframe.dtypes == "bool") | (self.dataframe.dtypes == "boolean") @@ -504,98 +800,110 @@ def _validate_booleanness(self) -> None: def __dataframe_namespace__(self) -> Any: return dataframe_api_compat.pandas_standard + @property + def column_names(self) -> list[str]: + return self.dataframe.columns.tolist() + @property def dataframe(self) -> pd.DataFrame: return self._dataframe - def shape(self) -> tuple[int, int]: - return self.dataframe.shape - - def group_by(self, keys: Sequence[str]) -> PandasGroupBy: - if not isinstance(keys, collections.abc.Sequence): - raise TypeError(f"Expected sequence of strings, got: {type(keys)}") - if isinstance(keys, str): - raise TypeError("Expected sequence of strings, got: str") + def group_by(self, *keys: str) -> PandasGroupBy: for key in keys: if key not in self.column_names: raise KeyError(f"key {key} not present in DataFrame's columns") return PandasGroupBy(self.dataframe, keys, api_version=self._api_version) - def get_column_by_name(self, name: str) -> PandasColumn[DType]: - if not isinstance(name, str): - raise ValueError(f"Expected str, got: {type(name)}") - return PandasColumn(self.dataframe.loc[:, name], api_version=self._api_version) - - def select(self, names: Sequence[str]) -> PandasDataFrame: - if isinstance(names, str): - raise TypeError(f"Expected sequence of str, got {type(names)}") - self._validate_columns(names) + def _broadcast_and_concat(self, columns) -> pd.DataFrame: + columns = [self._resolve_expression(col) for col in columns] + lengths = [len(col) for col in columns] + if len(set(lengths)) > 1: + # need to broadcast + max_len = max(lengths) + for i, length in enumerate(lengths): + if length == 1: + columns[i] = pd.Series( + [columns[i][0]] * max_len, name=columns[i].name + ) + return pd.concat(columns, axis=1) + + def select(self, *columns: str | Column | PermissiveColumn[Any]) -> PandasDataFrame: + new_columns = [] + for name in columns: + if isinstance(name, str): + new_columns.append(self.dataframe.loc[:, name]) + else: + new_columns.append(self._resolve_expression(name)) return PandasDataFrame( - self.dataframe.loc[:, list(names)], api_version=self._api_version + self._broadcast_and_concat(new_columns), + api_version=self._api_version, ) - def get_rows(self, indices: Column[Any]) -> PandasDataFrame: + def get_rows(self, indices: Column) -> PandasDataFrame: return PandasDataFrame( - self.dataframe.iloc[indices.column, :], api_version=self._api_version + self.dataframe.iloc[self._resolve_expression(indices), :], + api_version=self._api_version, ) def slice_rows( self, start: int | None, stop: int | None, step: int | None ) -> PandasDataFrame: - if start is None: - start = 0 - if stop is None: - stop = len(self.dataframe) - if step is None: - step = 1 return PandasDataFrame( self.dataframe.iloc[start:stop:step], api_version=self._api_version ) - def filter(self, mask: Column[Bool]) -> PandasDataFrame: - series = mask.column - self._validate_index(series.index) - return PandasDataFrame( - self.dataframe.loc[series, :], api_version=self._api_version - ) - - def insert(self, loc: int, label: str, value: Column[Any]) -> PandasDataFrame: - series = value.column - self._validate_index(series.index) - before = self.dataframe.iloc[:, :loc] - after = self.dataframe.iloc[:, loc:] - to_insert = value.column.rename(label) - return PandasDataFrame( - pd.concat([before, to_insert, after], axis=1), api_version=self._api_version - ) - - def insert_column(self, value: Column[Any]) -> PandasDataFrame: - series = value.column - self._validate_index(series.index) - before = self.dataframe - to_insert = value.column - return PandasDataFrame( - pd.concat([before, to_insert], axis=1), api_version=self._api_version - ) + def _broadcast(self, lhs, rhs): + if ( + isinstance(lhs, pd.Series) + and isinstance(rhs, pd.Series) + and len(lhs) != 1 + and len(rhs) == 1 + ): + rhs = pd.Series([rhs[0]] * len(lhs), name=rhs.name) + elif ( + isinstance(lhs, pd.Series) + and isinstance(rhs, pd.Series) + and len(lhs) == 1 + and len(rhs) != 1 + ): + lhs = pd.Series([lhs[0]] * len(rhs), name=lhs.name) + return lhs, rhs + + def _resolve_expression( + self, expression: PandasColumn | PandasPermissiveColumn | pd.Series | object + ) -> pd.Series: + if isinstance(expression, PandasPermissiveColumn): + return expression.column + if not isinstance(expression, PandasColumn): + # e.g. scalar + return expression + if not expression._calls: + return expression._base_call(self.dataframe) + output_name = expression.output_name + for func, lhs, rhs in expression._calls: + lhs = self._resolve_expression(lhs) + rhs = self._resolve_expression(rhs) + lhs, rhs = self._broadcast(lhs, rhs) + expression = func(lhs, rhs) + assert output_name == expression.name, f"{output_name} != {expression.name}" + return expression + + def filter(self, mask: Column | PermissiveColumn[Any]) -> PandasDataFrame: + df = self.dataframe + df = df.loc[self._resolve_expression(mask)] + return PandasDataFrame(df, api_version=self._api_version) - def update_columns(self, columns: PandasColumn[Any] | Sequence[PandasColumn[Any]], /) -> PandasDataFrame: # type: ignore[override] - if isinstance(columns, PandasColumn): - columns = [columns] - df = self.dataframe.copy() + def assign(self, *columns: Column | PermissiveColumn[Any]) -> PandasDataFrame: + df = self.dataframe.copy() # todo: remove defensive copy with CoW? for col in columns: - self._validate_index(col.column.index) - if col.name not in df.columns: - raise ValueError( - f"column {col.name} not in dataframe, use insert instead" - ) - df[col.name] = col.column + new_column = self._resolve_expression(col) + new_column, _ = self._broadcast(new_column, df.index.to_series()) + df[new_column.name] = new_column return PandasDataFrame(df, api_version=self._api_version) - def drop_column(self, label: str) -> PandasDataFrame: - if not isinstance(label, str): - raise TypeError(f"Expected str, got: {type(label)}") + def drop_columns(self, *labels: str) -> PandasDataFrame: return PandasDataFrame( - self.dataframe.drop(label, axis=1), api_version=self._api_version + self.dataframe.drop(list(labels), axis=1), api_version=self._api_version ) def rename_columns(self, mapping: Mapping[str, str]) -> PandasDataFrame: @@ -611,58 +919,17 @@ def get_column_names(self) -> list[str]: # pragma: no cover # just leave it in for backwards compatibility return self.dataframe.columns.tolist() - @property - def column_names(self) -> list[str]: - return self.dataframe.columns.tolist() - - @property - def schema(self) -> dict[str, Any]: - return { - column_name: dataframe_api_compat.pandas_standard.DTYPE_MAP[dtype.name] # type: ignore[misc] - for column_name, dtype in self.dataframe.dtypes.items() - } - - def sorted_indices( - self, - keys: Sequence[str] | None = None, - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal["first", "last"] = "last", - ) -> PandasColumn[Any]: - if keys is None: - keys = self.dataframe.columns.tolist() - df = self.dataframe.loc[:, list(keys)] - if ascending: - return PandasColumn( - df.sort_values(keys).index.to_series(), api_version=self._api_version - ) - return PandasColumn( - df.sort_values(keys).index.to_series()[::-1], api_version=self._api_version - ) - def sort( self, - keys: Sequence[str] | None = None, - *, + *keys: str | Column | PermissiveColumn[Any], ascending: Sequence[bool] | bool = True, nulls_position: Literal["first", "last"] = "last", ) -> PandasDataFrame: - if keys is None: + if not keys: keys = self.dataframe.columns.tolist() df = self.dataframe return PandasDataFrame( - df.sort_values(keys, ascending=ascending), api_version=self._api_version - ) - - def unique_indices( - self, - keys: Sequence[str] | None = None, - *, - skip_nulls: bool = True, - ) -> PandasColumn[Any]: - return PandasColumn( - self.dataframe.drop_duplicates(subset=keys).index.to_series(), - api_version=self._api_version, + df.sort_values(list(keys), ascending=ascending), api_version=self._api_version ) def __eq__(self, other: Any) -> PandasDataFrame: # type: ignore[override] @@ -768,14 +1035,6 @@ def all(self, *, skip_nulls: bool = True) -> PandasDataFrame: self.dataframe.all().to_frame().T, api_version=self._api_version ) - def any_rowwise(self, *, skip_nulls: bool = True) -> PandasColumn[Bool]: - self._validate_booleanness() - return PandasColumn(self.dataframe.any(axis=1), api_version=self._api_version) - - def all_rowwise(self, *, skip_nulls: bool = True) -> PandasColumn[Bool]: - self._validate_booleanness() - return PandasColumn(self.dataframe.all(axis=1), api_version=self._api_version) - def min(self, *, skip_nulls: bool = True) -> PandasDataFrame: return PandasDataFrame( self.dataframe.min().to_frame().T, api_version=self._api_version @@ -879,7 +1138,7 @@ def fill_null( value: Any, *, column_names: list[str] | None = None, - ) -> PandasDataFrame: + ) -> PandasPermissiveFrame: if column_names is None: column_names = self.dataframe.columns.tolist() df = self.dataframe.copy() @@ -900,19 +1159,13 @@ def fill_null( df[column] = col return PandasDataFrame(df, api_version=self._api_version) - def to_array_object(self, dtype: str) -> Any: - if dtype not in _ARRAY_API_DTYPES: - raise ValueError( - f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" - ) - return self.dataframe.to_numpy(dtype=dtype) - def join( self, other: DataFrame, + *, + how: Literal["left", "inner", "outer"], left_on: str | list[str], right_on: str | list[str], - how: Literal["left", "inner", "outer"], ) -> PandasDataFrame: if how not in ["left", "inner", "outer"]: raise ValueError(f"Expected 'left', 'inner', 'outer', got: {how}") @@ -923,3 +1176,237 @@ def join( ), api_version=self._api_version, ) + + def collect(self) -> PandasPermissiveFrame: + return PandasPermissiveFrame(self.dataframe, api_version=self._api_version) + + +class PandasPermissiveFrame(PermissiveFrame): + # Not technically part of the standard + + def __init__(self, dataframe: pd.DataFrame, api_version: str) -> None: + # note: less validation is needed here, as the validation will already + # have happened in DataFrame, and PermissiveFrame can only be created from that. + self._dataframe = dataframe.reset_index(drop=True) + self._api_version = api_version + + def __repr__(self) -> str: # pragma: no cover + return self.dataframe.__repr__() + + def _reuse_dataframe_implementation(self, function_name, *args, **kwargs): + return getattr(self.relax(), function_name)(*args, **kwargs).collect() + + # In the standard + def __dataframe_namespace__(self) -> Any: + return dataframe_api_compat.pandas_standard + + @property + def column_names(self) -> list[str]: + return self.dataframe.columns.tolist() + + @property + def schema(self) -> dict[str, Any]: + return { + column_name: dataframe_api_compat.pandas_standard.map_pandas_dtype_to_standard_dtype( + dtype.name + ) + for column_name, dtype in self.dataframe.dtypes.items() + } + + @property + def dataframe(self) -> pd.DataFrame: + return self._dataframe + + def group_by(self, *keys: str) -> PandasGroupBy: + for key in keys: + if key not in self.get_column_names(): + raise KeyError(f"key {key} not present in DataFrame's columns") + return PandasGroupBy(self.dataframe, keys, api_version=self._api_version) + + def select( + self, *columns: str | Column | PermissiveColumn[Any] + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("select", *columns) + + def get_column_by_name(self, name) -> PandasColumn: + return PandasPermissiveColumn( + self.dataframe.loc[:, name], api_version=self._api_version + ) + + def get_rows(self, indices: Column | PermissiveColumn) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("get_rows", indices) + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation( + "slice_rows", start=start, stop=stop, step=step + ) + + def filter(self, mask: Column | PermissiveColumn) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("filter", mask) + + def assign(self, *columns: Column | PermissiveColumn) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("assign", *columns) + + def drop_columns(self, *labels: str) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("drop_columns", *labels) + + def rename_columns(self, mapping: Mapping[str, str]) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("rename_columns", mapping=mapping) + + def get_column_names(self) -> list[str]: + return self.dataframe.columns.tolist() + + def sort( + self, + *keys: str | Column | PermissiveColumn, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal["first", "last"] = "last", + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation( + "sort", *keys, ascending=ascending, nulls_position=nulls_position + ) + + def __eq__(self, other: Any) -> PandasPermissiveFrame: # type: ignore[override] + return self._reuse_dataframe_implementation("__eq__", other) + + def __ne__(self, other: Any) -> PandasPermissiveFrame: # type: ignore[override] + return self._reuse_dataframe_implementation("__ne__", other) + + def __ge__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__ge__", other) + + def __gt__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__gt__", other) + + def __le__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__le__", other) + + def __lt__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__lt__", other) + + def __and__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__and__", other) + + def __or__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__or__", other) + + def __add__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__add__", other) + + def __sub__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__sub__", other) + + def __mul__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__mul__", other) + + def __truediv__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__truediv__", other) + + def __floordiv__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__floordiv__", other) + + def __pow__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__pow__", other) + + def __mod__(self, other: Any) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__mod__", other) + + def __divmod__( + self, + other: DataFrame | Any, + ) -> tuple[PandasPermissiveFrame, PandasPermissiveFrame]: + quotient, remainder = self.dataframe.__divmod__(other) + return PandasPermissiveFrame( + quotient, api_version=self._api_version + ), PandasPermissiveFrame(remainder, api_version=self._api_version) + + def __invert__(self) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("__invert__") + + def __iter__(self) -> NoReturn: + raise NotImplementedError() + + def any(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("any", skip_nulls=skip_nulls) + + def all(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("all", skip_nulls=skip_nulls) + + def min(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("min", skip_nulls=skip_nulls) + + def max(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("max", skip_nulls=skip_nulls) + + def sum(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("sum", skip_nulls=skip_nulls) + + def prod(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("prod", skip_nulls=skip_nulls) + + def median(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("median", skip_nulls=skip_nulls) + + def mean(self, *, skip_nulls: bool = True) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("mean", skip_nulls=skip_nulls) + + def std( + self, *, correction: int | float = 1.0, skip_nulls: bool = True + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation( + "std", correction=correction, skip_nulls=skip_nulls + ) + + def var( + self, *, correction: int | float = 1.0, skip_nulls: bool = True + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation( + "var", correction=correction, skip_nulls=skip_nulls + ) + + def is_null(self) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("is_null") + + def is_nan(self) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("is_nan") + + def fill_nan( + self, value: float | pd.NAType # type: ignore[name-defined] + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("fill_nan", value) + + def fill_null( + self, + value: Any, + *, + column_names: list[str] | None = None, + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation("fill_null", value) + + def to_array_object(self, dtype: str) -> Any: + if dtype not in _ARRAY_API_DTYPES: + raise ValueError( + f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" + ) + return self.dataframe.to_numpy(dtype=dtype) + + def join( + self, + other: PermissiveFrame, + *, + how: Literal["left", "inner", "outer"], + left_on: str | list[str], + right_on: str | list[str], + ) -> PandasPermissiveFrame: + return self._reuse_dataframe_implementation( + "join", + other=other.relax(), + left_on=left_on, + right_on=right_on, + how=how, + ) + + def relax(self) -> PandasDataFrame: + return PandasDataFrame(self.dataframe, api_version=self._api_version) diff --git a/dataframe_api_compat/polars_standard/__init__.py b/dataframe_api_compat/polars_standard/__init__.py index b593b9cd..721f5552 100644 --- a/dataframe_api_compat/polars_standard/__init__.py +++ b/dataframe_api_compat/polars_standard/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Any +from typing import Literal from typing import TYPE_CHECKING from typing import TypeVar @@ -11,12 +12,18 @@ from dataframe_api_compat.polars_standard.polars_standard import PolarsColumn from dataframe_api_compat.polars_standard.polars_standard import PolarsDataFrame from dataframe_api_compat.polars_standard.polars_standard import PolarsGroupBy +from dataframe_api_compat.polars_standard.polars_standard import PolarsPermissiveColumn +from dataframe_api_compat.polars_standard.polars_standard import PolarsPermissiveFrame if TYPE_CHECKING: from collections.abc import Sequence + from dataframe_api._types import DType -Column = PolarsColumn +col = PolarsColumn +Column = col +PermissiveColumn = PolarsPermissiveColumn DataFrame = PolarsDataFrame +PermissiveFrame = PolarsPermissiveFrame GroupBy = PolarsGroupBy PolarsType = TypeVar("PolarsType", pl.DataFrame, pl.LazyFrame) @@ -70,20 +77,53 @@ class String: ... -DTYPE_MAP = { - pl.Int64(): Int64(), - pl.Int32(): Int32(), - pl.Int16(): Int16(), - pl.Int8(): Int8(), - pl.UInt64(): UInt64(), - pl.UInt32(): UInt32(), - pl.UInt16(): UInt16(), - pl.UInt8(): UInt8(), - pl.Float64(): Float64(), - pl.Float32(): Float32(), - pl.Boolean(): Bool(), - pl.Utf8(): String(), -} +class Date: + ... + + +class Datetime: + def __init__(self, time_unit, time_zone=None): + self.time_unit = time_unit + self.time_zone = time_zone + + +class Duration: + def __init__(self, time_unit): + self.time_unit = time_unit + + +def map_polars_dtype_to_standard_dtype(dtype: Any) -> DType: + if dtype == pl.Int64: + return Int64() + if dtype == pl.Int32: + return Int32() + if dtype == pl.Int16: + return Int16() + if dtype == pl.Int8: + return Int8() + if dtype == pl.UInt64: + return UInt64() + if dtype == pl.UInt32: + return UInt32() + if dtype == pl.UInt16: + return UInt16() + if dtype == pl.UInt8: + return UInt8() + if dtype == pl.Float64: + return Float64() + if dtype == pl.Float32: + return Float32() + if dtype == pl.Boolean: + return Bool() + if dtype == pl.Utf8: + return String() + if dtype == pl.Date: + return Date() + if isinstance(dtype, pl.Datetime): + return Datetime(dtype.time_unit, dtype.time_zone) + if isinstance(dtype, pl.Duration): + return Duration(dtype.time_unit) + raise AssertionError(f"Got invalid dtype: {dtype}") def is_null(value: Any) -> bool: @@ -115,6 +155,11 @@ def _map_standard_to_polars_dtypes(dtype: Any) -> pl.DataType: return pl.Boolean() if isinstance(dtype, String): return pl.Utf8() + if isinstance(dtype, Datetime): + return pl.Datetime(dtype.time_unit, dtype.time_zone) + if isinstance(dtype, Duration): # pragma: no cover + # pending fix in polars itself + return pl.Duration(dtype.time_unit) raise AssertionError(f"Unknown dtype: {dtype}") @@ -130,11 +175,11 @@ def concat(dataframes: Sequence[PolarsDataFrame]) -> PolarsDataFrame: def dataframe_from_dict( - data: dict[str, PolarsColumn[Any]], *, api_version: str | None = None + data: dict[str, PolarsPermissiveColumn[Any]], *, api_version: str | None = None ) -> PolarsDataFrame: for _, col in data.items(): - if not isinstance(col, PolarsColumn): # pragma: no cover - raise TypeError(f"Expected PolarsColumn, got {type(col)}") + if not isinstance(col, PolarsPermissiveColumn): # pragma: no cover + raise TypeError(f"Expected PolarsPermissiveColumn, got {type(col)}") if isinstance(col.column, pl.Expr): raise NotImplementedError( "dataframe_from_dict not supported for lazy columns" @@ -149,10 +194,23 @@ def dataframe_from_dict( def column_from_1d_array( data: Any, *, dtype: Any, name: str, api_version: str | None = None -) -> PolarsColumn[Any]: # pragma: no cover +) -> PolarsPermissiveColumn[Any]: # pragma: no cover ser = pl.Series(values=data, dtype=_map_standard_to_polars_dtypes(dtype), name=name) - return PolarsColumn( - ser, dtype=ser.dtype, id_=None, api_version=api_version or LATEST_API_VERSION + return PolarsPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION) + + +def column_from_sequence( + sequence: Sequence[Any], + *, + dtype: Any, + name: str | None = None, + api_version: str | None = None, +) -> PolarsPermissiveColumn[Any]: + return PolarsPermissiveColumn( + pl.Series( + values=sequence, dtype=_map_standard_to_polars_dtypes(dtype), name=name + ), + api_version=api_version or LATEST_API_VERSION, ) @@ -172,25 +230,8 @@ def dataframe_from_2d_array( return PolarsDataFrame(df, api_version=api_version or LATEST_API_VERSION) -def column_from_sequence( - sequence: Sequence[Any], - *, - dtype: Any, - name: str | None = None, - api_version: str | None = None, -) -> PolarsColumn[Any]: - return PolarsColumn( - pl.Series( - values=sequence, dtype=_map_standard_to_polars_dtypes(dtype), name=name - ), - dtype=_map_standard_to_polars_dtypes(dtype), - id_=None, - api_version=api_version or LATEST_API_VERSION, - ) - - def convert_to_standard_compliant_dataframe( - df: pl.DataFrame | pl.LazyFrame, api_version: str | None = None + df: pl.LazyFrame, api_version: str | None = None ) -> PolarsDataFrame: df_lazy = df.lazy() if isinstance(df, pl.DataFrame) else df return PolarsDataFrame(df_lazy, api_version=api_version or LATEST_API_VERSION) @@ -198,10 +239,8 @@ def convert_to_standard_compliant_dataframe( def convert_to_standard_compliant_column( ser: pl.Series, api_version: str | None = None -) -> PolarsColumn[Any]: # pragma: no cover (todo: is this even needed?) - return PolarsColumn( - ser, dtype=ser.dtype, id_=None, api_version=api_version or LATEST_API_VERSION - ) +) -> PolarsPermissiveColumn[Any]: # pragma: no cover (todo: is this even needed?) + return PolarsPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION) def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool: @@ -220,3 +259,26 @@ def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool: if _kind == "string": dtypes.add(String) return isinstance(dtype, tuple(dtypes)) + + +def any_rowwise(*columns: str, skip_nulls: bool = True): + return PolarsColumn(pl.any_horizontal(list(columns) or "*").alias("any")) + + +def all_rowwise(*columns: str, skip_nulls: bool = True): + return PolarsColumn(pl.all_horizontal(list(columns) or "*").alias("all")) + + +def sorted_indices( + keys: str | list[str] | None = None, + *, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal["first", "last"] = "last", +) -> Column: + return PolarsColumn(pl.arg_sort_by(keys or "*", descending=not ascending)) + + +def unique_indices( + keys: str | list[str] | None = None, *, skip_nulls: bool = True +) -> Column: + raise NotImplementedError("namespace.unique_indices not implemented for polars yet") diff --git a/dataframe_api_compat/polars_standard/polars_standard.py b/dataframe_api_compat/polars_standard/polars_standard.py index 0e0c2812..c2a296be 100644 --- a/dataframe_api_compat/polars_standard/polars_standard.py +++ b/dataframe_api_compat/polars_standard/polars_standard.py @@ -5,7 +5,6 @@ from typing import Generic from typing import Literal from typing import NoReturn -from typing import Type from typing import TYPE_CHECKING from typing import TypeVar @@ -18,6 +17,8 @@ # and eager # BUT most things will probably work the same way? +col = None + _ARRAY_API_DTYPES = frozenset( ( "bool", @@ -39,33 +40,45 @@ from collections.abc import Mapping, Sequence from dataframe_api import ( - Bool, Column, + Bool, DataFrame, + PermissiveFrame, + PermissiveColumn, GroupBy, ) else: - class DataFrame(Generic[DType]): + class DataFrame: + ... + + class PermissiveFrame: + ... + + class PermissiveColumn(Generic[DType]): ... - class Column(Generic[DType]): + class Column: ... class GroupBy: ... + class Bool: + ... + class Null: ... null = Null() -NullType = Type[Null] +NullType = type[Null] def _is_integer_dtype(dtype: Any) -> bool: - return any( + return any( # pragma: no cover + # definitely covered, not sure what this is dtype is _dtype for _dtype in ( pl.Int64, @@ -84,40 +97,24 @@ def _is_integer_dtype(dtype: Any) -> bool: SUPPORTED_VERSIONS = frozenset((LATEST_API_VERSION, "2023.08-beta")) -class PolarsColumn(Column[DType]): +class PolarsPermissiveColumn(PermissiveColumn[DType]): def __init__( self, - column: pl.Series | pl.Expr, + column: pl.Series, *, - dtype: Any, - id_: int | None, # | None = None, api_version: str, ) -> None: if column is NotImplemented: raise NotImplementedError("operation not implemented") self._series = column - self._dtype = dtype - # keep track of which dataframe the column came from - self._id = id_ - if isinstance(column, pl.Series): - # just helps with defensiveness - assert column.dtype == dtype if api_version not in SUPPORTED_VERSIONS: - raise ValueError( + raise AssertionError( "Unsupported API version, expected one of: " f"{SUPPORTED_VERSIONS}. " "Try updating dataframe-api-compat?" ) self._api_version = api_version - - def _validate_column(self, column: PolarsColumn[Any] | Column[Any]) -> None: - assert isinstance(column, PolarsColumn) - if isinstance(column.column, pl.Expr) and column._id != self._id: - raise ValueError( - "Column was created from a different dataframe!", - column._id, - self._id, - ) + self._dtype = column.dtype # In the standard def __column_namespace__(self) -> Any: @@ -125,37 +122,30 @@ def __column_namespace__(self) -> Any: @property def name(self) -> str: - if isinstance(self.column, pl.Series): - return self.column.name - name = self.column.meta.output_name() - return name + return self.column.name @property def column(self) -> pl.Series | pl.Expr: return self._series - def __len__(self) -> int: - if isinstance(self.column, pl.Series): - return len(self.column) - raise NotImplementedError( - "__len__ intentionally not implemented for lazy columns" - ) + def len(self) -> int: + return len(self.column) @property def dtype(self) -> Any: - return dataframe_api_compat.polars_standard.DTYPE_MAP[self._dtype] + return dataframe_api_compat.polars_standard.map_polars_dtype_to_standard_dtype( + self._dtype + ) - def get_rows(self, indices: Column[Any]) -> PolarsColumn[DType]: - return PolarsColumn( + def get_rows(self, indices: PermissiveColumn[Any]) -> PolarsPermissiveColumn[DType]: + return PolarsPermissiveColumn( self.column.take(indices.column), - dtype=self._dtype, - id_=self._id, api_version=self._api_version, ) def slice_rows( self, start: int | None, stop: int | None, step: int | None - ) -> PolarsColumn[DType]: + ) -> PolarsPermissiveColumn[DType]: if isinstance(self.column, pl.Expr): raise NotImplementedError("slice_rows not implemented for lazy columns") if start is None: @@ -164,19 +154,16 @@ def slice_rows( stop = len(self.column) if step is None: step = 1 - return PolarsColumn( + return PolarsPermissiveColumn( self.column[start:stop:step], - dtype=self._dtype, - id_=self._id, api_version=self._api_version, ) - def filter(self, mask: Column[Bool]) -> PolarsColumn[DType]: - self._validate_column(mask) - return PolarsColumn( + def filter( + self, mask: Column | PermissiveColumn[Any] + ) -> PolarsPermissiveColumn[DType]: + return PolarsPermissiveColumn( self.column.filter(mask.column), - dtype=self._dtype, - id_=self._id, api_version=self._api_version, ) @@ -188,582 +175,263 @@ def get_value(self, row: int) -> Any: def __iter__(self) -> NoReturn: raise NotImplementedError() - def is_in(self, values: PolarsColumn[DType]) -> PolarsColumn[Bool]: # type: ignore[override] - self._validate_column(values) - if values.dtype != self.dtype: - raise ValueError(f"`value` has dtype {values.dtype}, expected {self.dtype}") - return PolarsColumn( - self.column.is_in(values.column), dtype=pl.Boolean(), id_=self._id, api_version=self._api_version # type: ignore[arg-type] + def is_in(self, values: PolarsPermissiveColumn[DType]) -> PolarsPermissiveColumn[Bool]: # type: ignore[override] + return PolarsPermissiveColumn( + self.column.is_in(values.column), api_version=self._api_version # type: ignore[arg-type] ) - def unique_indices(self, *, skip_nulls: bool = True) -> PolarsColumn[Any]: - if isinstance(self.column, pl.Expr): - raise NotImplementedError("unique_indices not implemented for lazy columns") - df = self.column.to_frame() - keys = df.columns - return PolarsColumn( - df.with_row_count().unique(keys).get_column("row_nr"), - dtype=pl.UInt32(), - id_=self._id, - api_version=self._api_version, - ) + def unique_indices(self, *, skip_nulls: bool = True) -> PolarsPermissiveColumn[Any]: + raise NotImplementedError("not yet supported") - def is_null(self) -> PolarsColumn[Bool]: - return PolarsColumn( - self.column.is_null(), - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) + def is_null(self) -> PolarsPermissiveColumn[Bool]: + return self._from_expression(self._to_expression().is_null()) - def is_nan(self) -> PolarsColumn[Bool]: - return PolarsColumn( - self.column.is_nan(), - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) + def is_nan(self) -> PolarsPermissiveColumn[Bool]: + return self._from_expression(self._to_expression().is_nan()) def any(self, *, skip_nulls: bool = True) -> bool | None: - if isinstance(self.column, pl.Expr): - raise NotImplementedError("any not implemented for lazy columns") return self.column.any() def all(self, *, skip_nulls: bool = True) -> bool | None: - if isinstance(self.column, pl.Expr): - raise NotImplementedError("all not implemented for lazy columns") return self.column.all() def min(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").min()) - .schema["a"] - ) - return PolarsColumn( - self.column.min(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.min() def max(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").max()) - .schema["a"] - ) - return PolarsColumn( - self.column.max(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.max() def sum(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").sum()) - .schema["a"] - ) - return PolarsColumn( - self.column.sum(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.sum() def prod(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").product()) - .schema["a"] - ) - return PolarsColumn( - self.column.product(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.product() def mean(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").mean()) - .schema["a"] - ) - return PolarsColumn( - self.column.mean(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.mean() def median(self, *, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").median()) - .schema["a"] - ) - return PolarsColumn( - self.column.median(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.median() def std(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").std()) - .schema["a"] - ) - return PolarsColumn( - self.column.std(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.std() def var(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: - if isinstance(self.column, pl.Expr): - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(pl.col("a").var()) - .schema["a"] - ) - return PolarsColumn( - self.column.var(), - id_=self._id, - dtype=res_dtype, - api_version=self._api_version, - ) return self.column.var() def __eq__( # type: ignore[override] - self, other: Column[DType] | Any - ) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - return PolarsColumn( - self.column == other.column, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__eq__(other._to_expression()) ) - return PolarsColumn( - self.column == other, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) + return self._from_expression(self._to_expression().__eq__(other)) def __ne__( # type: ignore[override] - self, other: Column[DType] | Any - ) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - return PolarsColumn( - self.column != other.column, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) - return PolarsColumn( - self.column != other, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) - - def __ge__(self, other: Column[DType] | Any) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - return PolarsColumn( - self.column >= other.column, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) - return PolarsColumn( - self.column >= other, - dtype=pl.Boolean(), - id_=self._id, - api_version=self._api_version, - ) - - def __gt__(self, other: Column[DType] | Any) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - return PolarsColumn( - self.column > other.column, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__ne__(other._to_expression()) ) - return PolarsColumn( - self.column > other, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, - ) - - def __le__(self, other: Column[DType] | Any) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - return PolarsColumn( - self.column <= other.column, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, - ) - return PolarsColumn( - self.column <= other, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, - ) - - def __lt__(self, other: Column[DType] | Any) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - return PolarsColumn( - self.column < other.column, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, - ) - return PolarsColumn( - self.column < other, - id_=self._id, - dtype=pl.Boolean(), - api_version=self._api_version, - ) - - def __mul__(self, other: Column[DType] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - res = self.column * other.column - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") * pl.col("b")) - .schema["result"] - ) - return PolarsColumn( - res, dtype=res_dtype, id_=self._id, api_version=self._api_version + return self._from_expression(self._to_expression().__ne__(other)) + + def __ge__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__ge__(other._to_expression()) ) - res = self.column * other - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") * other) - .schema["result"] - ) - return PolarsColumn( - res, dtype=res_dtype, id_=self._id, api_version=self._api_version - ) - - def __floordiv__(self, other: Column[DType] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") // other.column) - .schema["result"] + return self._from_expression(self._to_expression().__ge__(other)) + + def __gt__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__gt__(other._to_expression()) ) - return PolarsColumn( - self.column // other.column, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, + return self._from_expression(self._to_expression().__gt__(other)) + + def __le__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__le__(other._to_expression()) ) - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") // other) - .schema["result"] - ) - return PolarsColumn( - self.column // other, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, - ) - - def __truediv__(self, other: Column[DType] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - res = self.column / other.column - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") / pl.col("b")) - .schema["result"] + return self._from_expression(self._to_expression().__le__(other)) + + def __lt__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__lt__(other._to_expression()) ) - return PolarsColumn( - res, dtype=res_dtype, id_=self._id, api_version=self._api_version + return self._from_expression(self._to_expression().__lt__(other)) + + def __mul__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__mul__(other._to_expression()) ) - res = self.column / other - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") / other) - .schema["result"] - ) - return PolarsColumn( - res, dtype=res_dtype, id_=self._id, api_version=self._api_version - ) - - def __pow__(self, other: Column[DType] | Any) -> PolarsColumn[Any]: - original_type = self._dtype - if isinstance(other, PolarsColumn): - ret = self.column**other.column # type: ignore[operator] - ret_type = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": original_type, "b": other._dtype} - ) - .select(result=pl.col("a") ** pl.col("b")) - .schema["result"] + return self._from_expression(self._to_expression().__mul__(other)) + + def __floordiv__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__floordiv__(other._to_expression()) ) - if _is_integer_dtype(original_type) and _is_integer_dtype(other._dtype): - ret_type = original_type - ret = ret.cast(ret_type) - else: - ret = self.column.pow(other) # type: ignore[arg-type] - ret_type = ( - pl.DataFrame({"a": [1]}, schema={"a": original_type}) - .select(result=pl.col("a") ** other) # type: ignore[operator] - .schema["result"] + return self._from_expression(self._to_expression().__floordiv__(other)) + + def __truediv__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__truediv__(other._to_expression()) ) - if _is_integer_dtype(original_type) and isinstance(other, int): - ret_type = original_type - ret = ret.cast(ret_type) - return PolarsColumn( - ret, dtype=ret_type, id_=self._id, api_version=self._api_version - ) - - def __mod__(self, other: Column[DType] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") % other.column) - .schema["result"] + return self._from_expression(self._to_expression().__truediv__(other)) + + def __pow__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__pow__(other._to_expression()) ) - return PolarsColumn( - self.column % other.column, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, + return self._from_expression(self._to_expression().__pow__(other)) + + def __mod__( + self, other: PermissiveColumn[DType] | Any + ) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__mod__(other._to_expression()) ) - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") % other) - .schema["result"] - ) - return PolarsColumn( - self.column % other, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, - ) + return self._from_expression(self._to_expression().__mod__(other)) def __divmod__( self, - other: Column[DType] | Any, - ) -> tuple[PolarsColumn[Any], PolarsColumn[Any]]: + other: PermissiveColumn[DType] | Any, + ) -> tuple[PolarsPermissiveColumn[Any], PolarsPermissiveColumn[Any]]: # validation happens in the deferred calls anyway quotient = self // other remainder = self - quotient * other return quotient, remainder - def __and__(self, other: Column[Bool] | bool) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - return PolarsColumn( - self.column & other.column, dtype=self._dtype, id_=self._id, api_version=self._api_version # type: ignore[operator] + def __and__( + self, other: PermissiveColumn[Bool] | bool + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__and__(other._to_expression()) ) - return PolarsColumn(self.column & other, dtype=self._dtype, id_=self._id, api_version=self._api_version) # type: ignore[operator] - - def __or__(self, other: Column[Bool] | bool) -> PolarsColumn[Bool]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - return PolarsColumn( - self.column | other.column, dtype=self._dtype, id_=self._id, api_version=self._api_version # type: ignore[operator] + return self._from_expression(self._to_expression().__and__(other)) + + def __or__( + self, other: PermissiveColumn[Bool] | bool + ) -> PolarsPermissiveColumn[Bool]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__or__(other._to_expression()) ) - return PolarsColumn(self.column | other, dtype=self._dtype, id_=self._id, api_version=self._api_version) # type: ignore[operator] + return self._from_expression(self._to_expression().__or__(other)) - def __invert__(self) -> PolarsColumn[Bool]: - return PolarsColumn( - ~self.column, id_=self._id, dtype=self._dtype, api_version=self._api_version - ) + def __invert__(self) -> PolarsPermissiveColumn[Bool]: + return self._from_expression(self._to_expression().__invert__()) - def __add__(self, other: Column[Any] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") + pl.col("b")) - .schema["result"] - ) - return PolarsColumn( - self.column + other.column, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, + def __add__(self, other: PermissiveColumn[Any] | Any) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__add__(other._to_expression()) ) - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") + other) - .schema["result"] - ) - return PolarsColumn( - self.column + other, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, - ) + return self._from_expression(self._to_expression().__add__(other)) - def __sub__(self, other: Column[Any] | Any) -> PolarsColumn[Any]: - if isinstance(other, PolarsColumn): - self._validate_column(other) - res_dtype = ( - pl.DataFrame( - {"a": [1], "b": [1]}, schema={"a": self._dtype, "b": other._dtype} - ) - .select(result=pl.col("a") - pl.col("b")) - .schema["result"] + def __sub__(self, other: PermissiveColumn[Any] | Any) -> PolarsPermissiveColumn[Any]: + if isinstance(other, PermissiveColumn): + return self._from_expression( + self._to_expression().__sub__(other._to_expression()) ) - return PolarsColumn( - self.column - other.column, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, - ) - res_dtype = ( - pl.DataFrame({"a": [1]}, schema={"a": self._dtype}) - .select(result=pl.col("a") - other) - .schema["result"] - ) - return PolarsColumn( - self.column - other, - dtype=res_dtype, - id_=self._id, - api_version=self._api_version, - ) + return self._from_expression(self._to_expression().__sub__(other)) def sorted_indices( self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" - ) -> PolarsColumn[Any]: - # if isinstance(self.column, pl.Expr): - # raise NotImplementedError("sorted_indices not implemented for lazy columns") + ) -> PolarsPermissiveColumn[Any]: expr = self.column.arg_sort(descending=not ascending) - return PolarsColumn( + return PolarsPermissiveColumn( expr, - id_=self._id, - dtype=pl.UInt32(), api_version=self._api_version, ) def sort( self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" - ) -> PolarsColumn[Any]: - if self._api_version == "2023.08-beta": - raise NotImplementedError("dataframe.sort only available after 2023.08-beta") + ) -> PolarsPermissiveColumn[Any]: expr = self.column.sort(descending=not ascending) - return PolarsColumn( + return PolarsPermissiveColumn( expr, - id_=self._id, - dtype=self._dtype, api_version=self._api_version, ) - def fill_nan(self, value: float | NullType) -> PolarsColumn[DType]: - return PolarsColumn(self.column.fill_nan(value), dtype=self._dtype, id_=self._id, api_version=self._api_version) # type: ignore[arg-type] + def fill_nan(self, value: float | NullType) -> PolarsPermissiveColumn[DType]: + return self._from_expression(self._to_expression().fill_nan(value)) - def fill_null(self, value: Any) -> PolarsColumn[DType]: - return PolarsColumn( - self.column.fill_null(value), - dtype=self._dtype, - id_=self._id, - api_version=self._api_version, - ) + def fill_null(self, value: Any) -> PolarsPermissiveColumn[DType]: + return self._from_expression(self._to_expression().fill_null(value)) - def cumulative_sum(self, *, skip_nulls: bool = True) -> PolarsColumn[DType]: - return PolarsColumn( - self.column.cumsum(), - dtype=self._dtype, - id_=self._id, - api_version=self._api_version, + def cumulative_sum(self, *, skip_nulls: bool = True) -> PolarsColumn: + return self._from_expression( + self._to_expression().cumulative_sum(skip_nulls=skip_nulls) ) - def cumulative_prod(self, *, skip_nulls: bool = True) -> PolarsColumn[DType]: - return PolarsColumn( - self.column.cumprod(), - dtype=self._dtype, - id_=self._id, - api_version=self._api_version, + def cumulative_prod(self, *, skip_nulls: bool = True) -> PolarsColumn: + return self._from_expression( + self._to_expression().cumulative_prod(skip_nulls=skip_nulls) ) - def cumulative_max(self, *, skip_nulls: bool = True) -> PolarsColumn[DType]: - return PolarsColumn( - self.column.cummax(), - dtype=self._dtype, - id_=self._id, - api_version=self._api_version, + def cumulative_max(self, *, skip_nulls: bool = True) -> PolarsColumn: + return self._from_expression( + self._to_expression().cumulative_max(skip_nulls=skip_nulls) ) - def cumulative_min(self, *, skip_nulls: bool = True) -> PolarsColumn[DType]: - return PolarsColumn( - self.column.cummin(), - dtype=self._dtype, - id_=self._id, - api_version=self._api_version, + def cumulative_min(self, *, skip_nulls: bool = True) -> PolarsColumn: + return self._from_expression( + self._to_expression().cumulative_min(skip_nulls=skip_nulls) ) def to_array_object(self, dtype: str) -> Any: - if isinstance(self.column, pl.Expr): - raise NotImplementedError("to_array_object not implemented for lazy columns") if dtype not in _ARRAY_API_DTYPES: raise ValueError( f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" ) return self.column.to_numpy().astype(dtype) - def rename(self, name: str) -> PolarsColumn[DType]: - if isinstance(self.column, pl.Series): - return PolarsColumn( - self.column.rename(name), - id_=self._id, - dtype=self._dtype, - api_version=self._api_version, - ) - return PolarsColumn( - self.column.alias(name), - id_=self._id, - dtype=self._dtype, + def rename(self, name: str) -> PolarsPermissiveColumn[DType]: + return PolarsPermissiveColumn( + self.column.rename(name), api_version=self._api_version, ) + def _to_expression(self) -> PolarsColumn: + return PolarsColumn(pl.lit(self.column), api_version=self._api_version) + + def _from_expression(self, expression: PolarsColumn): + df = pl.select(expression._expr) + return PolarsPermissiveColumn( + df.get_column(df.columns[0]), api_version=self._api_version + ) + class PolarsGroupBy(GroupBy): def __init__(self, df: pl.LazyFrame, keys: Sequence[str], api_version: str) -> None: + assert isinstance(df, pl.LazyFrame) for key in keys: if key not in df.columns: raise KeyError(f"key {key} not present in DataFrame's columns") @@ -837,113 +505,734 @@ def var( return PolarsDataFrame(result, api_version=self._api_version) -class PolarsDataFrame(DataFrame): - def __init__(self, df: pl.LazyFrame, api_version: str) -> None: - # columns already have to be strings, and duplicates aren't - # allowed, so no validation required - if df is NotImplemented: +class PolarsColumn: + def __init__( + self, + expr: pl.Series | pl.Expr, + *, + api_version: str | None = None, + ) -> None: + if expr is NotImplemented: raise NotImplementedError("operation not implemented") - self.df = df - self._id = id(df) - if api_version not in SUPPORTED_VERSIONS: - raise ValueError( - "Unsupported API version, expected one of: " - f"{SUPPORTED_VERSIONS}. " - "Try updating dataframe-api-compat?" - ) - self._api_version = api_version - - def _validate_column(self, column: PolarsColumn[Any]) -> None: - if isinstance(column.column, pl.Expr) and column._id != self._id: - raise ValueError( - "Column was created from a different dataframe!", - column._id, - self._id, - ) + if isinstance(expr, str): + self._expr = pl.col(expr) + else: + self._expr = expr + # need to pass this down from namespace.col + self._api_version = api_version or LATEST_API_VERSION - def __dataframe_namespace__(self) -> Any: + # In the standard + def __column_namespace__(self) -> Any: # pragma: no cover return dataframe_api_compat.polars_standard @property - def dataframe(self) -> pl.LazyFrame: - return self.df + def root_names(self) -> list[str]: + return sorted(set(self._expr.meta.root_names())) - def group_by(self, keys: Sequence[str]) -> PolarsGroupBy: - return PolarsGroupBy(self.df, keys, api_version=self._api_version) - - def get_column_by_name(self, name: str) -> PolarsColumn[DType]: - dtype = self.dataframe.schema[name] - return PolarsColumn( - pl.col(name), dtype=dtype, id_=self._id, api_version=self._api_version - ) + @property + def output_name(self) -> list[str]: + return self._expr.meta.output_name() - def select(self, names: Sequence[str]) -> PolarsDataFrame: - if isinstance(names, str): - raise TypeError(f"Expected sequence of str, got {type(names)}") - return PolarsDataFrame(self.df.select(names), api_version=self._api_version) + def len(self) -> PolarsColumn: + return PolarsColumn(self._expr.len(), api_version=self._api_version) - def get_rows(self, indices: PolarsColumn[Any]) -> PolarsDataFrame: # type: ignore[override] - self._validate_column(indices) - return PolarsDataFrame( - self.dataframe.select(pl.all().take(indices.column)), + def get_rows(self, indices: PolarsColumn) -> PolarsColumn: + return PolarsColumn( + self._expr.take(indices._expr), api_version=self._api_version, ) def slice_rows( self, start: int | None, stop: int | None, step: int | None - ) -> PolarsDataFrame: - return PolarsDataFrame(self.df[start:stop:step], api_version=self._api_version) - - def filter(self, mask: Column[Bool]) -> PolarsDataFrame: - self._validate_column(mask) # type: ignore[arg-type] - return PolarsDataFrame(self.df.filter(mask.column), api_version=self._api_version) - - def insert(self, loc: int, label: str, value: Column[Any]) -> PolarsDataFrame: - self._validate_column(value) # type: ignore[arg-type] - columns = self.dataframe.columns - new_columns = columns[:loc] + [label] + columns[loc:] - df = self.dataframe.with_columns(value.column.alias(label)).select(new_columns) - return PolarsDataFrame(df, api_version=self._api_version) - - def insert_column(self, value: Column[Any]) -> PolarsDataFrame: - self._validate_column(value) # type: ignore[arg-type] - columns = self.dataframe.columns - label = value.name - new_columns = [*columns, label] - df = self.dataframe.with_columns(value.column).select(new_columns) - return PolarsDataFrame(df, api_version=self._api_version) - - def update_columns(self, columns: PolarsColumn[Any] | Sequence[PolarsColumn[Any]], /) -> PolarsDataFrame: # type: ignore[override] - if isinstance(columns, PolarsColumn): - columns = [columns] - for col in columns: - self._validate_column(col) - if col.name not in self.dataframe.columns: - raise ValueError( - f"column {col.name} not in dataframe, please use insert_column instead" - ) - return PolarsDataFrame( - self.dataframe.with_columns([col.column for col in columns]), + ) -> PolarsColumn: + if start is None: + start = 0 + length = None if stop is None else stop - start + if step is None: + step = 1 + return PolarsColumn( + self._expr.slice(start, length).take_every(step), api_version=self._api_version, ) - def drop_column(self, label: str) -> PolarsDataFrame: - if not isinstance(label, str): - raise TypeError(f"Expected str, got: {type(label)}") - return PolarsDataFrame(self.dataframe.drop(label), api_version=self._api_version) + def filter(self, mask: PolarsColumn) -> PolarsColumn: + return PolarsColumn( + self._expr.filter(mask._expr), api_version=self._api_version # type: ignore[arg-type] + ) - def rename_columns(self, mapping: Mapping[str, str]) -> PolarsDataFrame: - if not isinstance(mapping, collections.abc.Mapping): - raise TypeError(f"Expected Mapping, got: {type(mapping)}") - return PolarsDataFrame( + def get_value(self, row: int) -> Any: + return PolarsColumn( + self._expr.take(row), api_version=self._api_version # type: ignore[arg-type] + ) + + def __iter__(self) -> NoReturn: + raise NotImplementedError() + + def is_in(self, values: PolarsColumn) -> PolarsColumn: # type: ignore[override] + return PolarsColumn( + self._expr.is_in(values._expr), api_version=self._api_version # type: ignore[arg-type] + ) + + def unique_indices(self, *, skip_nulls: bool = True) -> PolarsColumn: + raise NotImplementedError() + + def is_null(self) -> PolarsColumn: + return PolarsColumn( + self._expr.is_null(), + api_version=self._api_version, + ) + + def is_nan(self) -> PolarsColumn: + return PolarsColumn( + self._expr.is_nan(), + api_version=self._api_version, + ) + + def any(self, *, skip_nulls: bool = True) -> bool | None: + return PolarsColumn(self._expr.any(), api_version=self._api_version) + + def all(self, *, skip_nulls: bool = True) -> bool | None: + return PolarsColumn(self._expr.all(), api_version=self._api_version) + + def min(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.min(), + api_version=self._api_version, + ) + + def max(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.max(), + api_version=self._api_version, + ) + + def sum(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.sum(), + api_version=self._api_version, + ) + + def prod(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.product(), + api_version=self._api_version, + ) + + def mean(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.mean(), + api_version=self._api_version, + ) + + def median(self, *, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.median(), + api_version=self._api_version, + ) + + def std(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.std(), + api_version=self._api_version, + ) + + def var(self, *, correction: int | float = 1.0, skip_nulls: bool = True) -> Any: + return PolarsColumn( + self._expr.var(), + api_version=self._api_version, + ) + + def __eq__(self, other: PolarsColumn | Any) -> PolarsColumn: # type: ignore[override] + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr == other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr == other, + api_version=self._api_version, + ) + + def __ne__(self, other: PolarsColumn | Any) -> PolarsColumn: # type: ignore[override] + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr != other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr != other, + api_version=self._api_version, + ) + + def __ge__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr >= other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr >= other, + api_version=self._api_version, + ) + + def __gt__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr > other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr > other, + api_version=self._api_version, + ) + + def __le__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr <= other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr <= other, + api_version=self._api_version, + ) + + def __lt__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr < other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr < other, + api_version=self._api_version, + ) + + def __mul__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + res = self._expr * other._expr + return PolarsColumn(res, api_version=self._api_version) + res = self._expr * other + return PolarsColumn(res, api_version=self._api_version) + + def __floordiv__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr // other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr // other, + api_version=self._api_version, + ) + + def __truediv__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + res = self._expr / other._expr + return PolarsColumn(res, api_version=self._api_version) + res = self._expr / other + return PolarsColumn(res, api_version=self._api_version) + + def __pow__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + ret = self._expr**other._expr # type: ignore[operator] + else: + ret = self._expr.pow(other) # type: ignore[arg-type] + return PolarsColumn(ret, api_version=self._api_version) + + def __mod__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr % other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr % other, + api_version=self._api_version, + ) + + def __divmod__( + self, + other: PolarsColumn | Any, + ) -> tuple[PolarsColumn, PolarsColumn]: + # validation happens in the deferred calls anyway + quotient = self // other + remainder = self - quotient * other + return quotient, remainder + + def __and__(self, other: PolarsColumn | bool) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn(self._expr & other._expr) + return PolarsColumn(self._expr & other) # type: ignore[operator] + + def __or__(self, other: PolarsColumn | bool) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn(self._expr | other._expr) + return PolarsColumn(self._expr | other) + + def __invert__(self) -> PolarsColumn: + return PolarsColumn(~self._expr, api_version=self._api_version) + + def __add__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr + other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr + other, + api_version=self._api_version, + ) + + def __sub__(self, other: PolarsColumn | Any) -> PolarsColumn: + if isinstance(other, PolarsColumn): + return PolarsColumn( + self._expr - other._expr, + api_version=self._api_version, + ) + return PolarsColumn( + self._expr - other, + api_version=self._api_version, + ) + + def sorted_indices( + self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" + ) -> PolarsColumn: + expr = self._expr.arg_sort(descending=not ascending) + return PolarsColumn( + expr, + api_version=self._api_version, + ) + + def sort( + self, *, ascending: bool = True, nulls_position: Literal["first", "last"] = "last" + ) -> PolarsColumn: + expr = self._expr.sort(descending=not ascending) + return PolarsColumn( + expr, + api_version=self._api_version, + ) + + def fill_nan(self, value: float | NullType) -> PolarsColumn: + return PolarsColumn(self._expr.fill_nan(value), api_version=self._api_version) # type: ignore[arg-type] + + def fill_null(self, value: Any) -> PolarsColumn: + return PolarsColumn( + self._expr.fill_null(value), + api_version=self._api_version, + ) + + def cumulative_sum(self, *, skip_nulls: bool = True) -> PolarsColumn: + return PolarsColumn( + self._expr.cumsum(), + api_version=self._api_version, + ) + + def cumulative_prod(self, *, skip_nulls: bool = True) -> PolarsColumn: + return PolarsColumn( + self._expr.cumprod(), + api_version=self._api_version, + ) + + def cumulative_max(self, *, skip_nulls: bool = True) -> PolarsColumn: + return PolarsColumn( + self._expr.cummax(), + api_version=self._api_version, + ) + + def cumulative_min(self, *, skip_nulls: bool = True) -> PolarsColumn: + return PolarsColumn( + self._expr.cummin(), + api_version=self._api_version, + ) + + def rename(self, name: str) -> PolarsColumn: + return PolarsColumn( + self._expr.alias(name), + api_version=self._api_version, + ) + + +class PolarsDataFrame(DataFrame): + def __init__(self, df: pl.LazyFrame, api_version: str) -> None: + # columns already have to be strings, and duplicates aren't + # allowed, so no validation required + if df is NotImplemented: + raise NotImplementedError("operation not implemented") + assert isinstance(df, pl.LazyFrame) + self.df = df + self._id = id(df) + if api_version not in SUPPORTED_VERSIONS: + raise AssertionError( + "Unsupported API version, expected one of: " + f"{SUPPORTED_VERSIONS}. " + "Try updating dataframe-api-compat?" + ) + self._api_version = api_version + + @property + def schema(self) -> dict[str, Any]: + return { + column_name: dataframe_api_compat.polars_standard.map_polars_dtype_to_standard_dtype( + dtype + ) + for column_name, dtype in self.dataframe.schema.items() + } + + def __repr__(self) -> str: # pragma: no cover + return self.dataframe.__repr__() + + def __dataframe_namespace__(self) -> Any: + return dataframe_api_compat.polars_standard + + @property + def column_names(self) -> list[str]: + return self.dataframe.columns + + @property + def dataframe(self) -> pl.LazyFrame: + return self.df + + def group_by(self, *keys: str) -> PolarsGroupBy: + return PolarsGroupBy(self.df, list(keys), api_version=self._api_version) + + def select(self, *columns: str | Column | PermissiveColumn[Any]) -> PolarsDataFrame: + resolved_names = [] + for name in columns: + if isinstance(name, PolarsColumn): + resolved_names.append(name._expr) + elif isinstance(name, str): + resolved_names.append(name) + else: + raise AssertionError(f"Expected str or PolarsColumn, got: {type(name)}") + return PolarsDataFrame( + self.df.select(resolved_names), api_version=self._api_version + ) + + def get_rows(self, indices: PolarsColumn) -> PolarsDataFrame: # type: ignore[override] + return PolarsDataFrame( + self.dataframe.select(pl.all().take(indices._expr)), + api_version=self._api_version, + ) + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> PolarsDataFrame: + return PolarsDataFrame(self.df[start:stop:step], api_version=self._api_version) + + def filter(self, mask: Column | PermissiveColumn[Any]) -> PolarsDataFrame: + return PolarsDataFrame(self.df.filter(mask._expr), api_version=self._api_version) + + def assign(self, *columns: Column | PermissiveColumn[Any]) -> PolarsDataFrame: + new_columns = [] + for col in columns: + if isinstance(col, PolarsColumn): + new_columns.append(col._expr) + elif isinstance(col, PolarsPermissiveColumn): + new_columns.append(col.column) + else: + raise AssertionError( + f"Expected PolarsColumn or PolarsPermissiveColumn, got: {type(col)}" + ) + df = self.dataframe.with_columns(new_columns) + return PolarsDataFrame(df, api_version=self._api_version) + + def drop_columns(self, *labels: str) -> PolarsDataFrame: + return PolarsDataFrame(self.dataframe.drop(labels), api_version=self._api_version) + + def rename_columns(self, mapping: Mapping[str, str]) -> PolarsDataFrame: + if not isinstance(mapping, collections.abc.Mapping): + raise TypeError(f"Expected Mapping, got: {type(mapping)}") + return PolarsDataFrame( self.dataframe.rename(dict(mapping)), api_version=self._api_version ) - def get_column_names(self) -> list[str]: # pragma: no cover - # DO NOT REMOVE - # This one is used in upstream tests - even if deprecated, - # just leave it in for backwards compatibility - return self.dataframe.columns + def get_column_names(self) -> list[str]: # pragma: no cover + # DO NOT REMOVE + # This one is used in upstream tests - even if deprecated, + # just leave it in for backwards compatibility + return self.dataframe.columns + + def __eq__( # type: ignore[override] + self, + other: Any, + ) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__eq__(other)), + api_version=self._api_version, + ) + + def __ne__( # type: ignore[override] + self, + other: Any, + ) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__ne__(other)), + api_version=self._api_version, + ) + + def __ge__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__ge__(other)), + api_version=self._api_version, + ) + + def __gt__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__gt__(other)), + api_version=self._api_version, + ) + + def __le__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__le__(other)), + api_version=self._api_version, + ) + + def __lt__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__lt__(other)), + api_version=self._api_version, + ) + + def __and__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*") & other), + api_version=self._api_version, + ) + + def __or__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns( + (pl.col(col) | other).alias(col) for col in self.dataframe.columns + ), + api_version=self._api_version, + ) + + def __add__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__add__(other)), + api_version=self._api_version, + ) + + def __sub__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__sub__(other)), + api_version=self._api_version, + ) + + def __mul__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__mul__(other)), + api_version=self._api_version, + ) + + def __truediv__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__truediv__(other)), + api_version=self._api_version, + ) + + def __floordiv__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").__floordiv__(other)), + api_version=self._api_version, + ) + + def __pow__(self, other: Any) -> PolarsDataFrame: + original_type = self.dataframe.schema + ret = self.dataframe.select([pl.col(col).pow(other) for col in self.column_names]) + for column in self.dataframe.columns: + if _is_integer_dtype(original_type[column]) and isinstance(other, int): + if other < 0: # pragma: no cover (todo) + raise ValueError("Cannot raise integer to negative power") + ret = ret.with_columns(pl.col(column).cast(original_type[column])) + return PolarsDataFrame(ret, api_version=self._api_version) + + def __mod__(self, other: Any) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*") % other), + api_version=self._api_version, + ) + + def __divmod__( + self, + other: DataFrame | Any, + ) -> tuple[PolarsDataFrame, PolarsDataFrame]: + quotient_df = self.dataframe.with_columns(pl.col("*") // other) + remainder_df = self.dataframe.with_columns( + pl.col("*") - (pl.col("*") // other) * other + ) + return PolarsDataFrame( + quotient_df, api_version=self._api_version + ), PolarsDataFrame(remainder_df, api_version=self._api_version) + + def __invert__(self) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(~pl.col("*")), api_version=self._api_version + ) + + def __iter__(self) -> NoReturn: + raise NotImplementedError() + + def is_null(self) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.with_columns(pl.col("*").is_null()), + api_version=self._api_version, + ) + + def is_nan(self) -> PolarsDataFrame: + df = self.dataframe.with_columns(pl.col("*").is_nan()) + return PolarsDataFrame(df, api_version=self._api_version) + + def any(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").any()), api_version=self._api_version + ) + + def all(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").all()), api_version=self._api_version + ) + + def min(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").min()), api_version=self._api_version + ) + + def max(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").max()), api_version=self._api_version + ) + + def sum(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").sum()), api_version=self._api_version + ) + + def prod(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").product()), api_version=self._api_version + ) + + def mean(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").mean()), api_version=self._api_version + ) + + def median(self, *, skip_nulls: bool = True) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").median()), api_version=self._api_version + ) + + def std( + self, *, correction: int | float = 1.0, skip_nulls: bool = True + ) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").std()), api_version=self._api_version + ) + + def var( + self, *, correction: int | float = 1.0, skip_nulls: bool = True + ) -> PolarsDataFrame: + return PolarsDataFrame( + self.dataframe.select(pl.col("*").var()), api_version=self._api_version + ) + + def sort( + self, + *keys: str | Column | PermissiveColumn[Any], + ascending: Sequence[bool] | bool = True, + nulls_position: Literal["first", "last"] = "last", + ) -> PolarsDataFrame: + if not keys: + keys = self.dataframe.columns + # TODO: what if there's multiple `ascending`? + return PolarsDataFrame( + self.dataframe.sort(list(keys), descending=not ascending), + api_version=self._api_version, + ) + + def fill_nan( + self, + value: float | NullType, + ) -> PolarsDataFrame: + if isinstance(value, Null): + value = None + return PolarsDataFrame(self.dataframe.fill_nan(value), api_version=self._api_version) # type: ignore[arg-type] + + def fill_null( + self, + value: Any, + *, + column_names: list[str] | None = None, + ) -> PolarsDataFrame: + if column_names is None: + column_names = self.dataframe.columns + df = self.dataframe.with_columns( + pl.col(col).fill_null(value) for col in column_names + ) + return PolarsDataFrame(df, api_version=self._api_version) + + def join( + self, + other: DataFrame, + *, + how: Literal["left", "inner", "outer"], + left_on: str | list[str], + right_on: str | list[str], + ) -> PolarsDataFrame: + if how not in ["left", "inner", "outer"]: + raise ValueError(f"Expected 'left', 'inner', 'outer', got: {how}") + + if isinstance(left_on, str): + left_on = [left_on] + if isinstance(right_on, str): + right_on = [right_on] + + # need to do some extra work to preserve all names + # https://github.com/pola-rs/polars/issues/9335 + extra_right_keys = set(right_on).difference(left_on) + assert isinstance(other, (PolarsDataFrame, PolarsPermissiveFrame)) + other_df = other.dataframe + # todo: make more robust + other_df = other_df.with_columns( + [pl.col(i).alias(f"{i}_tmp") for i in extra_right_keys] + ) + result = self.dataframe.join( + other_df, left_on=left_on, right_on=right_on, how=how + ) + result = result.rename({f"{i}_tmp": i for i in extra_right_keys}) + + return PolarsDataFrame(result, api_version=self._api_version) + + def collect(self) -> PolarsPermissiveFrame: + return PolarsPermissiveFrame( + self.dataframe.collect(), api_version=self._api_version + ) + + +class PolarsPermissiveFrame(PermissiveFrame): + def __init__(self, df: pl.LazyFrame, api_version: str) -> None: + # columns already have to be strings, and duplicates aren't + # allowed, so no validation required + if df is NotImplemented: + raise NotImplementedError("operation not implemented") + self.df = df + self._id = id(df) + if api_version not in SUPPORTED_VERSIONS: + raise AssertionError( + "Unsupported API version, expected one of: " + f"{SUPPORTED_VERSIONS}. " + "Try updating dataframe-api-compat?" + ) + self._api_version = api_version + + def __repr__(self) -> str: # pragma: no cover + return self.dataframe.__repr__() + + def __dataframe_namespace__(self) -> Any: + return dataframe_api_compat.polars_standard @property def column_names(self) -> list[str]: @@ -952,15 +1241,70 @@ def column_names(self) -> list[str]: @property def schema(self) -> dict[str, Any]: return { - column_name: dataframe_api_compat.polars_standard.DTYPE_MAP[dtype] # type: ignore[index] + column_name: dataframe_api_compat.polars_standard.map_polars_dtype_to_standard_dtype( + dtype + ) for column_name, dtype in self.dataframe.schema.items() } + @property + def dataframe(self) -> pl.LazyFrame: + return self.df + + def group_by(self, *keys: str) -> PolarsGroupBy: + return PolarsGroupBy(self.df.lazy(), list(keys), api_version=self._api_version) + + def select( + self, *columns: str | Column | PermissiveColumn[Any] + ) -> PolarsPermissiveFrame: + return self.relax().select(*columns).collect() + + def get_column_by_name(self, name) -> PolarsPermissiveColumn: + return PolarsPermissiveColumn( + self.dataframe.get_column(name), api_version=self._api_version + ) + + def get_rows(self, indices: PolarsColumn[Any]) -> PolarsDataFrame: # type: ignore[override] + return PolarsPermissiveFrame( + self.dataframe.select(pl.all().take(indices.column)), + api_version=self._api_version, + ) + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> PolarsDataFrame: + return PolarsPermissiveFrame( + self.df[start:stop:step], api_version=self._api_version + ) + + def filter(self, mask: PolarsPermissiveColumn | PolarsColumn) -> PolarsDataFrame: + if isinstance(mask, PolarsPermissiveColumn): + mask = mask._to_expression() + return PolarsPermissiveFrame( + self.df.filter(mask._expr), api_version=self._api_version + ) + + def assign(self, *columns: PolarsColumn | PolarsColumn) -> PolarsDataFrame: + return self.relax().assign(*columns).collect() + + def drop_columns(self, *labels: str) -> PolarsDataFrame: + return self.relax().drop_columns(*labels).collect() + + def rename_columns(self, mapping: Mapping[str, str]) -> PolarsDataFrame: + if not isinstance(mapping, collections.abc.Mapping): + raise TypeError(f"Expected Mapping, got: {type(mapping)}") + return PolarsPermissiveFrame( + self.dataframe.rename(dict(mapping)), api_version=self._api_version + ) + + def get_column_names(self) -> list[str]: + return self.dataframe.columns + def __eq__( # type: ignore[override] self, other: Any, ) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__eq__(other)), api_version=self._api_version, ) @@ -969,43 +1313,43 @@ def __ne__( # type: ignore[override] self, other: Any, ) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__ne__(other)), api_version=self._api_version, ) def __ge__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__ge__(other)), api_version=self._api_version, ) def __gt__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__gt__(other)), api_version=self._api_version, ) def __le__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__le__(other)), api_version=self._api_version, ) def __lt__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__lt__(other)), api_version=self._api_version, ) def __and__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*") & other), api_version=self._api_version, ) def __or__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns( (pl.col(col) | other).alias(col) for col in self.dataframe.columns ), @@ -1013,47 +1357,49 @@ def __or__(self, other: Any) -> PolarsDataFrame: ) def __add__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__add__(other)), api_version=self._api_version, ) def __sub__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__sub__(other)), api_version=self._api_version, ) def __mul__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__mul__(other)), api_version=self._api_version, ) def __truediv__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__truediv__(other)), api_version=self._api_version, ) def __floordiv__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").__floordiv__(other)), api_version=self._api_version, ) def __pow__(self, other: Any) -> PolarsDataFrame: original_type = self.dataframe.schema - ret = self.dataframe.select([pl.col(col).pow(other) for col in self.column_names]) + ret = self.dataframe.select( + [pl.col(col).pow(other) for col in self.get_column_names()] + ) for column in self.dataframe.columns: if _is_integer_dtype(original_type[column]) and isinstance(other, int): if other < 0: # pragma: no cover (todo) raise ValueError("Cannot raise integer to negative power") ret = ret.with_columns(pl.col(column).cast(original_type[column])) - return PolarsDataFrame(ret, api_version=self._api_version) + return PolarsPermissiveFrame(ret, api_version=self._api_version) def __mod__(self, other: Any) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*") % other), api_version=self._api_version, ) @@ -1062,18 +1408,16 @@ def __divmod__( self, other: DataFrame | Any, ) -> tuple[PolarsDataFrame, PolarsDataFrame]: - # quotient = self // other - # remainder = self - quotient * other quotient_df = self.dataframe.with_columns(pl.col("*") // other) remainder_df = self.dataframe.with_columns( pl.col("*") - (pl.col("*") // other) * other ) - return PolarsDataFrame( + return PolarsPermissiveFrame( quotient_df, api_version=self._api_version - ), PolarsDataFrame(remainder_df, api_version=self._api_version) + ), PolarsPermissiveFrame(remainder_df, api_version=self._api_version) def __invert__(self) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(~pl.col("*")), api_version=self._api_version ) @@ -1081,128 +1425,88 @@ def __iter__(self) -> NoReturn: raise NotImplementedError() def is_null(self) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.with_columns(pl.col("*").is_null()), api_version=self._api_version, ) def is_nan(self) -> PolarsDataFrame: df = self.dataframe.with_columns(pl.col("*").is_nan()) - return PolarsDataFrame(df, api_version=self._api_version) + return PolarsPermissiveFrame(df, api_version=self._api_version) def any(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").any()), api_version=self._api_version ) def all(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").all()), api_version=self._api_version ) - def any_rowwise(self, *, skip_nulls: bool = True) -> PolarsColumn[Bool]: - expr = pl.any_horizontal(pl.col("*")) - return PolarsColumn( - expr, id_=self._id, dtype=pl.Boolean(), api_version=self._api_version - ) - - def all_rowwise(self, *, skip_nulls: bool = True) -> PolarsColumn[Bool]: - expr = pl.all_horizontal(pl.col("*")) - return PolarsColumn( - expr, id_=self._id, dtype=pl.Boolean(), api_version=self._api_version - ) - def min(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").min()), api_version=self._api_version ) def max(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").max()), api_version=self._api_version ) def sum(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").sum()), api_version=self._api_version ) def prod(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").product()), api_version=self._api_version ) def mean(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").mean()), api_version=self._api_version ) def median(self, *, skip_nulls: bool = True) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").median()), api_version=self._api_version ) def std( self, *, correction: int | float = 1.0, skip_nulls: bool = True ) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").std()), api_version=self._api_version ) def var( self, *, correction: int | float = 1.0, skip_nulls: bool = True ) -> PolarsDataFrame: - return PolarsDataFrame( + return PolarsPermissiveFrame( self.dataframe.select(pl.col("*").var()), api_version=self._api_version ) - def sorted_indices( - self, - keys: Sequence[Any] | None = None, - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal["first", "last"] = "last", - ) -> PolarsColumn[Any]: - if keys is None: - keys = self.dataframe.columns - expr = pl.arg_sort_by(keys, descending=not ascending) - return PolarsColumn( - expr, - dtype=pl.UInt32(), - id_=self._id, - api_version=self._api_version, - ) - def sort( self, - keys: Sequence[Any] | None = None, - *, + *keys: str, ascending: Sequence[bool] | bool = True, nulls_position: Literal["first", "last"] = "last", ) -> PolarsDataFrame: - if keys is None: - keys = self.dataframe.columns - # TODO: what if there's multiple `ascending`? - return PolarsDataFrame( - self.dataframe.sort(keys, descending=not ascending), - api_version=self._api_version, + return ( + self.relax() + .sort(*keys, ascending=ascending, nulls_position=nulls_position) + .collect() ) - def unique_indices( - self, keys: Sequence[str] | None = None, *, skip_nulls: bool = True - ) -> PolarsColumn[Any]: - df = self.dataframe - if keys is None: - keys = df.columns - raise NotImplementedError("unique_indices is not yet supported for lazyframes") - def fill_nan( self, value: float | NullType, ) -> PolarsDataFrame: if isinstance(value, Null): value = None - return PolarsDataFrame(self.dataframe.fill_nan(value), api_version=self._api_version) # type: ignore[arg-type] + return PolarsPermissiveFrame(self.dataframe.fill_nan(value), api_version=self._api_version) # type: ignore[arg-type] def fill_null( self, @@ -1215,43 +1519,28 @@ def fill_null( df = self.dataframe.with_columns( pl.col(col).fill_null(value) for col in column_names ) - return PolarsDataFrame(df, api_version=self._api_version) + return PolarsPermissiveFrame(df, api_version=self._api_version) def to_array_object(self, dtype: str) -> Any: if dtype not in _ARRAY_API_DTYPES: raise ValueError( f"Invalid dtype {dtype}. Expected one of {_ARRAY_API_DTYPES}" ) - # uurrggghhhh...we REALLY need to change this - return self.dataframe.collect().to_numpy().astype(dtype) + return self.dataframe.to_numpy().astype(dtype) def join( self, - other: DataFrame, + other: PolarsPermissiveFrame, + *, + how: Literal["left", "inner", "outer"], left_on: str | list[str], right_on: str | list[str], - how: Literal["left", "inner", "outer"], ) -> PolarsDataFrame: - if how not in ["left", "inner", "outer"]: - raise ValueError(f"Expected 'left', 'inner', 'outer', got: {how}") - - if isinstance(left_on, str): - left_on = [left_on] - if isinstance(right_on, str): - right_on = [right_on] - - # need to do some extra work to preserve all names - # https://github.com/pola-rs/polars/issues/9335 - extra_right_keys = set(right_on).difference(left_on) - assert isinstance(other, PolarsDataFrame) - other_df = other.dataframe - # todo: make more robust - other_df = other_df.with_columns( - [pl.col(i).alias(f"{i}_tmp") for i in extra_right_keys] - ) - result = self.dataframe.join( - other_df, left_on=left_on, right_on=right_on, how=how + return ( + self.relax() + .join(other.relax(), left_on=left_on, right_on=right_on, how=how) + .collect() ) - result = result.rename({f"{i}_tmp": i for i in extra_right_keys}) - return PolarsDataFrame(result, api_version=self._api_version) + def relax(self) -> PolarsDataFrame: + return PolarsDataFrame(self.dataframe.lazy(), api_version=self._api_version) diff --git a/make.sh b/make.sh index 929b005d..5bcf0446 100644 --- a/make.sh +++ b/make.sh @@ -1,3 +1,3 @@ -PYTHONPATH=../dataframe-api/spec/API_specification/ python check_completeness.py -. type-check.sh +#PYTHONPATH=../dataframe-api/spec/API_specification/ python check_completeness.py +#. type-check.sh pytest tests --cov=dataframe_api_compat --cov=tests --cov-fail-under=100 -W error diff --git a/pyproject.toml b/pyproject.toml index 2d2d8646..cdb6d423 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ [tool.ruff] line-length = 90 fix = true -target-version = "py38" +target-version = "py39" select = [ "E", # pycodestyle diff --git a/t.py b/t.py deleted file mode 100644 index 4b917563..00000000 --- a/t.py +++ /dev/null @@ -1,29 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import polars as pl - - -df_pandas = pd.read_parquet("iris.parquet") -df_polars = pl.scan_parquet("iris.parquet") - - -def my_dataframe_agnostic_function(df): - df = df.__dataframe_consortium_standard__(api_version="2023.09-beta") - - mask = df.get_column_by_name("species") != "setosa" - df = df.filter(mask) - - for column_name in df.column_names: - if column_name == "species": - continue - new_column = df.get_column_by_name(column_name) - new_column = (new_column - new_column.mean()) / new_column.std() - df = df.insert_column(new_column.rename(f"{column_name}_scaled")) - - return df.dataframe - - -# Then, either of the following will work as expected: -print(my_dataframe_agnostic_function(df_pandas)) -print(my_dataframe_agnostic_function(df_polars).collect()) diff --git a/tests/__init__.py b/tests/__init__.py index e69de29b..9d48db4f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +from __future__ import annotations diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py index 496e6118..818af5b2 100644 --- a/tests/column/and_or_test.py +++ b/tests/column/and_or_test.py @@ -11,41 +11,52 @@ import pytest -def test_column_and(library: str, request: pytest.FixtureRequest) -> None: - df = bool_dataframe_1(library) +def test_column_and(library: str) -> None: + df = bool_dataframe_1(library, api_version="2023.09-beta").collect() ser = df.get_column_by_name("a") other = df.get_column_by_name("b") - result = df.insert(0, "result", ser & other) + result = df.assign((ser & other).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([True, True, False], name="result") + pd.testing.assert_series_equal(result_pd, expected) + + +def test_expression_and(library: str) -> None: + df = bool_dataframe_1(library, api_version="2023.09-beta") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + other = namespace.col("b") + result = df.assign((ser & other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([True, True, False], name="result") pd.testing.assert_series_equal(result_pd, expected) def test_column_or(library: str) -> None: - df = bool_dataframe_1(library) + df = bool_dataframe_1(library).collect() ser = df.get_column_by_name("a") other = df.get_column_by_name("b") - result = df.insert(0, "result", ser | other) + result = df.assign((ser | other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([True, True, True], name="result") pd.testing.assert_series_equal(result_pd, expected) def test_column_and_with_scalar(library: str, request: pytest.FixtureRequest) -> None: - df = bool_dataframe_1(library) + df = bool_dataframe_1(library).collect() ser = df.get_column_by_name("a") other = True - result = df.insert(0, "result", ser & other) + result = df.assign((ser & other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([True, True, False], name="result") pd.testing.assert_series_equal(result_pd, expected) def test_column_or_with_scalar(library: str, request: pytest.FixtureRequest) -> None: - df = bool_dataframe_1(library) + df = bool_dataframe_1(library).collect() ser = df.get_column_by_name("a") other = True - result = df.insert(0, "result", ser | other) + result = df.assign((ser | other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([True, True, True], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/any_all_test.py b/tests/column/any_all_test.py index bafc9ba9..6b4a344e 100644 --- a/tests/column/any_all_test.py +++ b/tests/column/any_all_test.py @@ -1,6 +1,10 @@ from __future__ import annotations +import pandas as pd + +from tests.utils import bool_dataframe_1 from tests.utils import bool_series_1 +from tests.utils import interchange_to_pandas def test_column_any(library: str) -> None: @@ -13,3 +17,21 @@ def test_column_all(library: str) -> None: ser = bool_series_1(library) result = ser.all() assert not result + + +def test_expr_any(library: str) -> None: + df = bool_dataframe_1(library) + col = df.__dataframe_namespace__().col + result = df.select(col("a").any()) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [True]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_expr_all(library: str) -> None: + df = bool_dataframe_1(library) + col = df.__dataframe_namespace__().col + result = df.select(col("a").all()) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [False]}) + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/column/sorted_indices_test.py b/tests/column/col_sorted_indices_test.py similarity index 51% rename from tests/column/sorted_indices_test.py rename to tests/column/col_sorted_indices_test.py index 17b38b21..cd3349ae 100644 --- a/tests/column/sorted_indices_test.py +++ b/tests/column/col_sorted_indices_test.py @@ -7,8 +7,42 @@ from tests.utils import interchange_to_pandas -def test_column_sorted_indices_ascending(library: str) -> None: +def test_expression_sorted_indices_ascending(library: str) -> None: + df = integer_dataframe_6(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + sorted_indices = col("b").sorted_indices() + result = df.get_rows(sorted_indices) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + { + "a": [2, 2, 1, 1, 1], + "b": [1, 2, 3, 4, 4], + } + ) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_expression_sorted_indices_descending(library: str) -> None: df = integer_dataframe_6(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + sorted_indices = col("b").sorted_indices(ascending=False) + result = df.get_rows(sorted_indices) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 2, 1], + } + ) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_column_sorted_indices_ascending(library: str) -> None: + df = integer_dataframe_6(library).collect() sorted_indices = df.get_column_by_name("b").sorted_indices() result = df.get_rows(sorted_indices) result_pd = interchange_to_pandas(result, library) @@ -23,7 +57,7 @@ def test_column_sorted_indices_ascending(library: str) -> None: def test_column_sorted_indices_descending(library: str) -> None: - df = integer_dataframe_6(library) + df = integer_dataframe_6(library).collect() sorted_indices = df.get_column_by_name("b").sorted_indices(ascending=False) result = df.get_rows(sorted_indices) result_pd = interchange_to_pandas(result, library) diff --git a/tests/column/col_to_array_object_test.py b/tests/column/col_to_array_object_test.py index dd21638a..16bfaa77 100644 --- a/tests/column/col_to_array_object_test.py +++ b/tests/column/col_to_array_object_test.py @@ -23,15 +23,8 @@ "float64", ], ) -def test_column_to_array_object( - library: str, dtype: str, request: pytest.FixtureRequest -) -> None: - df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") - if library == "polars-lazy": - with pytest.raises(NotImplementedError): - result = np.asarray(ser.to_array_object(dtype=dtype)) - return +def test_column_to_array_object(library: str, dtype: str) -> None: + ser = integer_series_1(library) result = np.asarray(ser.to_array_object(dtype=dtype)) expected = np.array([1, 2, 3], dtype=np.int64) np.testing.assert_array_equal(result, expected) @@ -45,14 +38,10 @@ def test_column_to_array_object_bool(library: str) -> None: np.testing.assert_array_equal(result, expected) -def test_column_to_array_object_invalid( - library: str, request: pytest.FixtureRequest -) -> None: +def test_column_to_array_object_invalid(library: str) -> None: dtype = "object" df = integer_dataframe_1(library) with pytest.raises(ValueError): - np.asarray(df.to_array_object(dtype=dtype)) - with pytest.raises((ValueError, NotImplementedError)): - np.asarray(df.get_column_by_name("a").to_array_object(dtype=dtype)) + np.asarray(df.collect().to_array_object(dtype=dtype)) with pytest.raises(ValueError): np.asarray(integer_series_1(library).to_array_object(dtype=dtype)) diff --git a/tests/column/column_test.py b/tests/column/column_test.py index c342e1dd..14619095 100644 --- a/tests/column/column_test.py +++ b/tests/column/column_test.py @@ -1,25 +1,19 @@ from __future__ import annotations -from typing import cast - import pandas as pd -import polars as pl -from dataframe_api_compat import pandas_standard from tests.utils import integer_dataframe_1 -def test_column_column() -> None: - namespace = integer_dataframe_1("polars-lazy").__dataframe_namespace__() +def test_column_column(library: str) -> None: + namespace = integer_dataframe_1(library).__dataframe_namespace__() ser = namespace.column_from_sequence([1, 2, 3], name="a", dtype=namespace.Int64()) - result_pl = ser.column - result_pl = cast(pl.Series, result_pl) - pd.testing.assert_series_equal(result_pl.to_pandas(), pd.Series([1, 2, 3], name="a")) - result_pd = ( - pandas_standard.convert_to_standard_compliant_dataframe( - pd.DataFrame({"a": [1, 2, 3]}), "2023.08-beta" - ) - .get_column_by_name("a") - .column - ) - pd.testing.assert_series_equal(result_pd, pd.Series([1, 2, 3], name="a")) + result = ser.column + if library == "polars-lazy": + pd.testing.assert_series_equal(result.to_pandas(), pd.Series([1, 2, 3], name="a")) + elif library == "pandas-numpy": # noqa: SIM114 + pd.testing.assert_series_equal(result, pd.Series([1, 2, 3], name="a")) + elif library == "pandas-nullable": + pd.testing.assert_series_equal(result, pd.Series([1, 2, 3], name="a")) + else: + raise AssertionError() diff --git a/tests/column/comparisons_test.py b/tests/column/comparisons_test.py index e1180941..47d28de5 100644 --- a/tests/column/comparisons_test.py +++ b/tests/column/comparisons_test.py @@ -3,6 +3,7 @@ from typing import Any import pandas as pd +import polars as pl import pytest from tests.utils import integer_dataframe_1 @@ -34,12 +35,15 @@ def test_column_comparisons( expected_data: list[object], ) -> None: ser: Any - df = integer_dataframe_7(library) + df = integer_dataframe_7(library).collect() ser = df.get_column_by_name("a") other = df.get_column_by_name("b") - result = df.insert(0, "result", (getattr(ser, comparison)(other))) + result = df.assign(getattr(ser, comparison)(other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series(expected_data, name="result") + if library in ("polars", "polars-lazy") and comparison == "__pow__": + # TODO + result_pd = result_pd.astype("int64") pd.testing.assert_series_equal(result_pd, expected) @@ -67,17 +71,57 @@ def test_column_comparisons_scalar( expected_data: list[object], ) -> None: ser: Any - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") other = 3 - result = df.insert(0, "result", (getattr(ser, comparison)(other))) + result = df.assign(getattr(ser, comparison)(other).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series(expected_data, name="result") + if comparison == "__pow__" and library in ("polars", "polars-lazy"): + result_pd = result_pd.astype("int64") + pd.testing.assert_series_equal(result_pd, expected) + + +@pytest.mark.parametrize( + ("comparison", "expected_data"), + [ + ("__eq__", [False, False, True]), + ("__ne__", [True, True, False]), + ("__ge__", [False, False, True]), + ("__gt__", [False, False, False]), + ("__le__", [True, True, True]), + ("__lt__", [True, True, False]), + ("__add__", [4, 5, 6]), + ("__sub__", [-2, -1, 0]), + ("__mul__", [3, 6, 9]), + ("__truediv__", [1 / 3, 2 / 3, 1]), + ("__floordiv__", [0, 0, 1]), + ("__pow__", [1, 8, 27]), + ("__mod__", [1, 2, 0]), + ], +) +def test_expression_comparisons_scalar( + library: str, + comparison: str, + expected_data: list[object], +) -> None: + ser: Any + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + other = 3 + result = df.assign(getattr(ser, comparison)(other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series(expected_data, name="result") + if comparison == "__pow__" and library in ("polars", "polars-lazy"): + result_pd = result_pd.astype("int64") pd.testing.assert_series_equal(result_pd, expected) -def test_invalid_comparison() -> None: - df1 = integer_dataframe_1("polars-lazy") - df2 = integer_dataframe_1("polars-lazy") - with pytest.raises(ValueError): - _ = df1.get_column_by_name("a") > df2.get_column_by_name("a") +def test_combine_column_and_expression(library: str) -> None: + df = integer_dataframe_1(library).collect() + namespace = df.__dataframe_namespace__() + ser = df.get_column_by_name("a") + other = namespace.col("b") + with pytest.raises((KeyError, AttributeError, TypeError, pl.ColumnNotFoundError)): + _ = ser > other diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py index 64f15669..eae6617f 100644 --- a/tests/column/cumulative_test.py +++ b/tests/column/cumulative_test.py @@ -19,9 +19,9 @@ def test_cumulative_functions_column( library: str, func: str, expected_data: list[float], request: pytest.FixtureRequest ) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") expected = pd.Series(expected_data, name="result") - result = df.insert(0, "result", getattr(ser, func)()) + result = df.assign(getattr(ser, func)().rename("result")) result_pd = interchange_to_pandas(result, library)["result"] pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/divmod_test.py b/tests/column/divmod_test.py index ff137df8..9c956c25 100644 --- a/tests/column/divmod_test.py +++ b/tests/column/divmod_test.py @@ -3,38 +3,74 @@ import pandas as pd from tests.utils import integer_dataframe_1 +from tests.utils import integer_series_1 from tests.utils import interchange_to_pandas def test_column_divmod(library: str) -> None: - df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + df = integer_dataframe_1(library).collect() + ser = integer_series_1(library) other = df.get_column_by_name("b") result_quotient, result_remainder = ser.__divmod__(other) # quotient - result = df.insert(0, "result", result_quotient) + result = df.assign(result_quotient.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected_quotient = pd.Series([0, 0, 0], name="result") pd.testing.assert_series_equal(result_pd, expected_quotient) # remainder - result = df.insert(0, "result", result_remainder) + result = df.assign(result_remainder.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected_remainder = pd.Series([1, 2, 3], name="result") pd.testing.assert_series_equal(result_pd, expected_remainder) def test_column_divmod_with_scalar(library: str) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") df.get_column_by_name("b") result_quotient, result_remainder = ser.__divmod__(2) # quotient - result = df.insert(0, "result", result_quotient) + result = df.assign(result_quotient.rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected_quotient = pd.Series([0, 1, 1], name="result") + pd.testing.assert_series_equal(result_pd, expected_quotient) + # remainder + result = df.assign(result_remainder.rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected_remainder = pd.Series([1, 0, 1], name="result") + pd.testing.assert_series_equal(result_pd, expected_remainder) + + +def test_expression_divmod(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + other = namespace.col("b") + result_quotient, result_remainder = ser.__divmod__(other) + # quotient + result = df.assign(result_quotient.rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected_quotient = pd.Series([0, 0, 0], name="result") + pd.testing.assert_series_equal(result_pd, expected_quotient) + # remainder + result = df.assign(result_remainder.rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected_remainder = pd.Series([1, 2, 3], name="result") + pd.testing.assert_series_equal(result_pd, expected_remainder) + + +def test_expression_divmod_with_scalar(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + result_quotient, result_remainder = ser.__divmod__(2) + # quotient + result = df.assign(result_quotient.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected_quotient = pd.Series([0, 1, 1], name="result") pd.testing.assert_series_equal(result_pd, expected_quotient) # remainder - result = df.insert(0, "result", result_remainder) + result = df.assign(result_remainder.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected_remainder = pd.Series([1, 0, 1], name="result") pd.testing.assert_series_equal(result_pd, expected_remainder) diff --git a/tests/column/fill_nan_test.py b/tests/column/fill_nan_test.py index 5f9039b3..85636923 100644 --- a/tests/column/fill_nan_test.py +++ b/tests/column/fill_nan_test.py @@ -8,9 +8,9 @@ def test_column_fill_nan(library: str) -> None: # todo: test with nullable pandas, check null isn't filled - df = nan_dataframe_1(library) + df = nan_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.fill_nan(-1.0)) + result = df.assign(ser.fill_nan(-1.0).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([1.0, 2.0, -1.0], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/fill_null_test.py b/tests/column/fill_null_test.py index 1dd1e4a7..eee621f6 100644 --- a/tests/column/fill_null_test.py +++ b/tests/column/fill_null_test.py @@ -6,20 +6,20 @@ def test_fill_null_column(library: str) -> None: - df = null_dataframe_2(library) + df = null_dataframe_2(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.fill_null(0)) - result = maybe_collect(result, library)["result"] + result = df.assign(ser.fill_null(0).rename("result")).relax() + result = maybe_collect(result)["result"] assert result[2] == 0.0 assert result[1] != 0.0 assert result[0] != 0.0 def test_fill_null_noop_column(library: str) -> None: - df = nan_dataframe_1(library) + df = nan_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.fill_null(0)) - result = maybe_collect(result, library)["result"] + result = df.assign(ser.fill_null(0).rename("result")).relax() + result = maybe_collect(result)["result"] if library != "pandas-numpy": # nan should not have changed! assert result[2] != result[2] diff --git a/tests/column/get_rows_by_mask_test.py b/tests/column/get_rows_by_mask_test.py index 2d5b6a83..3a76cb44 100644 --- a/tests/column/get_rows_by_mask_test.py +++ b/tests/column/get_rows_by_mask_test.py @@ -1,24 +1,31 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas -def test_column_filter(library: str, request: pytest.FixtureRequest) -> None: - df = integer_dataframe_1(library) +def test_column_filter(library: str) -> None: + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") mask = ser > 1 ser = ser.filter(mask) result = df.filter(mask) - if library == "polars-lazy": - # created from a different dataframe - with pytest.raises(ValueError): - result = result.insert(0, "result", ser) - return - result = result.insert(0, "result", ser) + result = result.assign(ser.rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([2, 3], name="result") + pd.testing.assert_series_equal(result_pd, expected) + + +def test_expression_filter(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + mask = ser > 1 + ser = ser.filter(mask) + result = df.filter(mask) + result = result.assign(ser.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([2, 3], name="result") pd.testing.assert_series_equal(result_pd, expected) @@ -26,10 +33,11 @@ def test_column_filter(library: str, request: pytest.FixtureRequest) -> None: def test_column_get_rows_by_mask_noop(library: str) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") mask = ser > 0 ser = ser.filter(mask) - result = df.insert(0, "result", ser) + result = df.assign(ser.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([1, 2, 3], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/get_rows_test.py b/tests/column/get_rows_test.py index f28cf28e..44a5a99e 100644 --- a/tests/column/get_rows_test.py +++ b/tests/column/get_rows_test.py @@ -1,17 +1,13 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas -def test_column_get_rows(library: str, request: pytest.FixtureRequest) -> None: - if library == "polars-lazy": - # lazy column.get_rows not generally supported - request.node.add_marker(pytest.mark.xfail()) - df = integer_dataframe_1(library) +def test_column_get_rows(library: str) -> None: + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") namespace = ser.__column_namespace__() indices = namespace.column_from_sequence( @@ -23,3 +19,15 @@ def test_column_get_rows(library: str, request: pytest.FixtureRequest) -> None: result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([1, 3, 2], name="result") pd.testing.assert_series_equal(result_pd, expected) + + +def test_expression_get_rows(library: str) -> None: + df = integer_dataframe_1(library) + col = df.__dataframe_namespace__().col + ser = col("a") + ser.__column_namespace__() + indices = col("a") - 1 + result = df.select(ser.get_rows(indices).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([1, 2, 3], name="result") + pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/get_value_test.py b/tests/column/get_value_test.py index 28f9721d..1a64d33c 100644 --- a/tests/column/get_value_test.py +++ b/tests/column/get_value_test.py @@ -1,8 +1,21 @@ from __future__ import annotations +import pandas as pd + +from tests.utils import integer_dataframe_1 from tests.utils import integer_series_1 +from tests.utils import interchange_to_pandas def test_get_value(library: str) -> None: result = integer_series_1(library).get_value(0) assert result == 1 + + +def test_get_value_expr(library: str) -> None: + df = integer_dataframe_1(library) + col = df.__dataframe_namespace__().col + result = df.select(col("a").get_value(0)) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [1]}) + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/column/invert_test.py b/tests/column/invert_test.py index 03e350ef..5efb77ab 100644 --- a/tests/column/invert_test.py +++ b/tests/column/invert_test.py @@ -6,10 +6,20 @@ from tests.utils import interchange_to_pandas -def test_column_invert(library: str) -> None: +def test_expression_invert(library: str) -> None: df = bool_dataframe_1(library) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + result = df.assign((~ser).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([False, False, True], name="result") + pd.testing.assert_series_equal(result_pd, expected) + + +def test_column_invert(library: str) -> None: + df = bool_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ~ser) + result = df.assign((~ser).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([False, False, True], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/is_in_test.py b/tests/column/is_in_test.py index e7c40e0a..859598ed 100644 --- a/tests/column/is_in_test.py +++ b/tests/column/is_in_test.py @@ -9,7 +9,6 @@ from tests.utils import float_dataframe_1 from tests.utils import float_dataframe_2 from tests.utils import float_dataframe_3 -from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas if TYPE_CHECKING: @@ -30,17 +29,34 @@ def test_is_in( expected_values: list[bool], request: pytest.FixtureRequest, ) -> None: - df = df_factory(library, request) + df = df_factory(library, request).collect() ser = df.get_column_by_name("a") other = ser + 1 - result = df.insert(0, "result", ser.is_in(other)) + result = df.assign(ser.is_in(other).rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series(expected_values, name="result") pd.testing.assert_series_equal(result_pd, expected) -def test_is_in_raises(library: str) -> None: - ser = integer_dataframe_1(library).get_column_by_name("a") - other = ser * 1.0 - with pytest.raises(ValueError): - ser.is_in(other) +@pytest.mark.parametrize( + ("df_factory", "expected_values"), + [ + (float_dataframe_1, [False, True]), + (float_dataframe_2, [True, False]), + (float_dataframe_3, [True, False]), + ], +) +def test_expr_is_in( + library: str, + df_factory: Callable[[str, pytest.FixtureRequest], Any], + expected_values: list[bool], + request: pytest.FixtureRequest, +) -> None: + df = df_factory(library, request) + col = df.__dataframe_namespace__().col + ser = col("a") + other = ser + 1 + result = df.assign(ser.is_in(other).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series(expected_values, name="result") + pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/is_nan_test.py b/tests/column/is_nan_test.py index fc0253aa..7ada5a02 100644 --- a/tests/column/is_nan_test.py +++ b/tests/column/is_nan_test.py @@ -7,9 +7,9 @@ def test_column_is_nan(library: str) -> None: - df = nan_dataframe_1(library) + df = nan_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.is_nan()) + result = df.assign(ser.is_nan().rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([False, False, True], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/is_null_test.py b/tests/column/is_null_test.py index 2b2ad076..e3a6eed2 100644 --- a/tests/column/is_null_test.py +++ b/tests/column/is_null_test.py @@ -1,21 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING - import pandas as pd from tests.utils import interchange_to_pandas from tests.utils import nan_dataframe_1 from tests.utils import null_dataframe_1 -if TYPE_CHECKING: - import pytest - -def test_column_is_null_1(library: str, request: pytest.FixtureRequest) -> None: - df = nan_dataframe_1(library) +def test_column_is_null_1(library: str) -> None: + df = nan_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.is_null()) + result = df.assign(ser.is_null().rename("result")) result_pd = interchange_to_pandas(result, library)["result"] if library == "pandas-numpy": expected = pd.Series([False, False, True], name="result") @@ -24,10 +19,10 @@ def test_column_is_null_1(library: str, request: pytest.FixtureRequest) -> None: pd.testing.assert_series_equal(result_pd, expected) -def test_column_is_null_2(library: str, request: pytest.FixtureRequest) -> None: - df = null_dataframe_1(library) +def test_column_is_null_2(library: str) -> None: + df = null_dataframe_1(library).collect() ser = df.get_column_by_name("a") - result = df.insert(0, "result", ser.is_null()) + result = df.assign(ser.is_null().rename("result")) result_pd = interchange_to_pandas(result, library)["result"] expected = pd.Series([False, False, True], name="result") pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/len_test.py b/tests/column/len_test.py index 052c6f0b..acb63014 100644 --- a/tests/column/len_test.py +++ b/tests/column/len_test.py @@ -1,8 +1,23 @@ from __future__ import annotations +import pandas as pd + +from tests.utils import integer_dataframe_1 from tests.utils import integer_series_1 +from tests.utils import interchange_to_pandas def test_column_len(library: str) -> None: - result = len(integer_series_1(library)) + result = integer_series_1(library).len() assert result == 3 + + +def test_expr_len(library: str) -> None: + df = integer_dataframe_1(library) + col = df.__dataframe_namespace__().col + result = df.select(col("a").len()) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [3]}) + if library == "polars-lazy": + result_pd["a"] = result_pd["a"].astype("int64") + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/column/name_test.py b/tests/column/name_test.py index 7fd62332..e0f94554 100644 --- a/tests/column/name_test.py +++ b/tests/column/name_test.py @@ -1,9 +1,17 @@ from __future__ import annotations +import pandas as pd +import pytest + from tests.utils import integer_dataframe_1 def test_name(library: str) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() name = df.get_column_by_name("a").name assert name == "a" + + +def test_invalid_name_pandas() -> None: + with pytest.raises(ValueError): + pd.Series([1, 2, 3], name=0).__column_consortium_standard__() diff --git a/tests/column/output_name_test.py b/tests/column/output_name_test.py new file mode 100644 index 00000000..fc2d9ed4 --- /dev/null +++ b/tests/column/output_name_test.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from tests.utils import integer_dataframe_1 + + +def test_output_name(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + + assert col("a").output_name == "a" + assert col("b").output_name == "b" + assert col("b").rename("c").output_name == "c" + assert (col("b") + col("a")).output_name == "b" + assert (col("b") + col("a") + col("a")).output_name == "b" + assert namespace.any_rowwise(["a", "b"]).output_name == "any" diff --git a/tests/column/pow_test.py b/tests/column/pow_test.py index 02814dd5..a8245961 100644 --- a/tests/column/pow_test.py +++ b/tests/column/pow_test.py @@ -14,12 +14,13 @@ def test_float_powers_column(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") - other = df.get_column_by_name("b") * 1.0 - result = df.insert(0, "result", ser.__pow__(other)) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + other = namespace.col("b") * 1.0 + result = df.assign(ser.__pow__(other).rename("result")) result_pd = interchange_to_pandas(result, library) expected = pd.DataFrame( - {"result": [1.0, 32.0, 729.0], "a": [1, 2, 3], "b": [4, 5, 6]} + {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]} ) result_pd = convert_dataframe_to_pandas_numpy(result_pd) pd.testing.assert_frame_equal(result_pd, expected) @@ -27,32 +28,39 @@ def test_float_powers_column(library: str, request: pytest.FixtureRequest) -> No def test_float_powers_scalar_column(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") other = 1.0 - result = df.insert(0, "result", ser.__pow__(other)) + result = df.assign(ser.__pow__(other).rename("result")) result_pd = interchange_to_pandas(result, library) - expected = pd.DataFrame({"result": [1.0, 2.0, 3.0], "a": [1, 2, 3], "b": [4, 5, 6]}) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]}) result_pd = convert_dataframe_to_pandas_numpy(result_pd) pd.testing.assert_frame_equal(result_pd, expected) def test_int_powers_column(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") - other = df.get_column_by_name("b") * 1 - result = df.insert(0, "result", ser.__pow__(other)) + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") + other = namespace.col("b") * 1 + result = df.assign(ser.__pow__(other).rename("result")) result_pd = interchange_to_pandas(result, library) - expected = pd.DataFrame({"result": [1, 32, 729], "a": [1, 2, 3], "b": [4, 5, 6]}) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]}) result_pd = convert_dataframe_to_pandas_numpy(result_pd) + if library in ("polars", "polars-lazy"): + result_pd = result_pd.astype("int64") pd.testing.assert_frame_equal(result_pd, expected) def test_int_powers_scalar_column(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") other = 1 - result = df.insert(0, "result", ser.__pow__(other)) + result = df.assign(ser.__pow__(other).rename("result")) result_pd = interchange_to_pandas(result, library) - expected = pd.DataFrame({"result": [1, 2, 3], "a": [1, 2, 3], "b": [4, 5, 6]}) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]}) result_pd = convert_dataframe_to_pandas_numpy(result_pd) + if library in ("polars", "polars-lazy"): + result_pd = result_pd.astype("int64") pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/column/reductions_test.py b/tests/column/reductions_test.py index 1d263d04..98365630 100644 --- a/tests/column/reductions_test.py +++ b/tests/column/reductions_test.py @@ -27,7 +27,7 @@ def test_column_reductions( df = integer_dataframe_1(library) ser = integer_series_1(library) ser = ser - getattr(ser, reduction)() - result = df.insert(0, "result", ser) + result = df.assign(ser.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] ser_pd = interchange_to_pandas(df, library)["a"].rename("result") expected_pd = ser_pd - expected @@ -47,13 +47,14 @@ def test_column_reductions( ("var", 1.0), ], ) -def test_column_reference_reductions( +def test_expression_reductions( library: str, reduction: str, expected: float, request: pytest.FixtureRequest ) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") ser = ser - getattr(ser, reduction)() - result = df.insert(0, "result", ser) + result = df.assign(ser.rename("result")) result_pd = interchange_to_pandas(result, library)["result"] ser_pd = interchange_to_pandas(df, library)["a"].rename("result") expected_pd = ser_pd - expected diff --git a/tests/column/rename_test.py b/tests/column/rename_test.py index ed67d704..eddcaf7e 100644 --- a/tests/column/rename_test.py +++ b/tests/column/rename_test.py @@ -4,7 +4,7 @@ def test_rename(library: str) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() ser = df.get_column_by_name("a") result = ser.rename("new_name") assert result.name == "new_name" diff --git a/tests/column/root_names_test.py b/tests/column/root_names_test.py new file mode 100644 index 00000000..03abe125 --- /dev/null +++ b/tests/column/root_names_test.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from tests.utils import integer_dataframe_1 + + +def test_root_names(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + + assert col("a").root_names == ["a"] + assert col("b").root_names == ["b"] + assert col("b").rename("c").root_names == ["b"] + assert (col("b") + col("a")).root_names == ["a", "b"] + assert (col("b") + col("a") + col("a")).root_names == ["a", "b"] + assert namespace.any_rowwise("a", "b").root_names == ["a", "b"] diff --git a/tests/column/slice_rows_test.py b/tests/column/slice_rows_test.py index fe6a02a9..cf1ea30e 100644 --- a/tests/column/slice_rows_test.py +++ b/tests/column/slice_rows_test.py @@ -34,3 +34,29 @@ def test_column_slice_rows( namespace.dataframe_from_dict({"result": (result).rename("result")}), library )["result"] pd.testing.assert_series_equal(result_pd, expected) + + +@pytest.mark.parametrize( + ("start", "stop", "step", "expected"), + [ + (2, 7, 2, pd.Series([3, 5, 7], name="result")), + (None, 7, 2, pd.Series([1, 3, 5, 7], name="result")), + (2, None, 2, pd.Series([3, 5, 7], name="result")), + (2, None, None, pd.Series([3, 4, 5, 6, 7], name="result")), + ], +) +def test_expr_slice_rows( + library: str, + start: int | None, + stop: int | None, + step: int | None, + expected: pd.Series[Any], +) -> None: + df = integer_dataframe_3(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result_pd = interchange_to_pandas( + df.select(col("a").slice_rows(start, stop, step).rename("result")), + library=library, + )["result"] + pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/column/sort_test.py b/tests/column/sort_test.py index 15b36800..5a6f9ef6 100644 --- a/tests/column/sort_test.py +++ b/tests/column/sort_test.py @@ -7,10 +7,44 @@ from tests.utils import interchange_to_pandas -def test_column_sort_ascending(library: str) -> None: +def test_expression_sort_ascending(library: str) -> None: + df = integer_dataframe_6(library, api_version="2023.09-beta") + namespace = df.__dataframe_namespace__() + sorted = namespace.col("b").sort().rename("c") + result = df.assign(sorted) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [1, 2, 3, 4, 4], + } + ) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_expression_sort_descending(library: str) -> None: df = integer_dataframe_6(library, api_version="2023.09-beta") + namespace = df.__dataframe_namespace__() + sorted = namespace.col("b").sort(ascending=False).rename("c") + result = df.assign(sorted) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [4, 4, 3, 2, 1], + } + ) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_column_sort_ascending(library: str) -> None: + df = integer_dataframe_6(library, api_version="2023.09-beta").collect() sorted = df.get_column_by_name("b").sort().rename("c") - result = df.insert_column(sorted) + result = df.assign(sorted) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame( @@ -24,9 +58,9 @@ def test_column_sort_ascending(library: str) -> None: def test_column_sort_descending(library: str) -> None: - df = integer_dataframe_6(library, api_version="2023.09-beta") + df = integer_dataframe_6(library, api_version="2023.09-beta").collect() sorted = df.get_column_by_name("b").sort(ascending=False).rename("c") - result = df.insert_column(sorted) + result = df.assign(sorted) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame( diff --git a/tests/column/statistics_test.py b/tests/column/statistics_test.py index e5bcd520..104b1d01 100644 --- a/tests/column/statistics_test.py +++ b/tests/column/statistics_test.py @@ -2,16 +2,24 @@ from typing import TYPE_CHECKING +import pandas as pd + +from tests.utils import integer_dataframe_1 from tests.utils import integer_series_1 from tests.utils import integer_series_5 +from tests.utils import interchange_to_pandas if TYPE_CHECKING: import pytest -def test_mean(library: str, request: pytest.FixtureRequest) -> None: - result = integer_series_5(library, request).mean() - assert result == 2.0 +def test_mean(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + result = df.assign((namespace.col("a") - namespace.col("a").mean()).rename("result")) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([-1, 0, 1.0], name="result") + pd.testing.assert_series_equal(result_pd, expected) def test_std(library: str, request: pytest.FixtureRequest) -> None: diff --git a/tests/column/unique_indices_test.py b/tests/column/unique_indices_test.py index 28095aad..f3b99a3e 100644 --- a/tests/column/unique_indices_test.py +++ b/tests/column/unique_indices_test.py @@ -1,19 +1,20 @@ from __future__ import annotations import pandas as pd +import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_6 from tests.utils import interchange_to_pandas -def test_unique_indices_column(library: str) -> None: - namespace = integer_dataframe_6(library).__dataframe_namespace__() - ser = namespace.column_from_sequence( - [4, 4, 3, 1, 2], name="b", dtype=namespace.Int64() - ) +def test_unique_indices_column( + library: str, request: pytest.FixtureRequest +) -> None: # pragma: no cover + request.node.add_marker(pytest.mark.xfail()) df = integer_dataframe_6(library) - df = df.get_rows(ser.unique_indices()) + namespace = df.__dataframe_namespace__() + df = df.get_rows(namespace.unique_indices(["b"])) result = df.get_rows(df.sorted_indices()) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) diff --git a/tests/dataframe/all_rowwise_test.py b/tests/dataframe/all_rowwise_test.py index 80be3a57..fb1b28a8 100644 --- a/tests/dataframe/all_rowwise_test.py +++ b/tests/dataframe/all_rowwise_test.py @@ -9,8 +9,8 @@ def test_all_rowwise(library: str) -> None: df = bool_dataframe_1(library) - df.__dataframe_namespace__() - result = df.filter(df.all_rowwise()) + namespace = df.__dataframe_namespace__() + result = df.filter(namespace.all_rowwise()) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [True, True], "b": [True, True]}) diff --git a/tests/dataframe/and_test.py b/tests/dataframe/and_test.py index 7250d099..ada2de63 100644 --- a/tests/dataframe/and_test.py +++ b/tests/dataframe/and_test.py @@ -1,19 +1,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import Any +from typing import Callable import pandas as pd +import pytest from tests.utils import bool_dataframe_1 from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import interchange_to_pandas -if TYPE_CHECKING: - import pytest - -def test_and_with_scalar(library: str, request: pytest.FixtureRequest) -> None: - df = bool_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_and_with_scalar(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(bool_dataframe_1(library)) other = True result = df & other result_pd = interchange_to_pandas(result, library) diff --git a/tests/dataframe/any_all_test.py b/tests/dataframe/any_all_test.py index 4747c023..e9bde1e7 100644 --- a/tests/dataframe/any_all_test.py +++ b/tests/dataframe/any_all_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -16,12 +19,14 @@ ("all", {"a": [False], "b": [True]}), ], ) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) def test_reductions( library: str, reduction: str, expected_data: dict[str, object], + relax: Callable[[Any], Any], ) -> None: - df = bool_dataframe_1(library) + df = relax(bool_dataframe_1(library)) result = getattr(df, reduction)() result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) @@ -29,17 +34,21 @@ def test_reductions( pd.testing.assert_frame_equal(result_pd, expected) -def test_any(library: str) -> None: - df = bool_dataframe_3(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_any(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(bool_dataframe_3(library)) result = df.any() result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [False], "b": [True], "c": [True]}) pd.testing.assert_frame_equal(result_pd, expected) -def test_all(library: str) -> None: - df = bool_dataframe_3(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_all(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(bool_dataframe_3(library)) result = df.all() result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [False], "b": [False], "c": [True]}) pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/any_rowwise_test.py b/tests/dataframe/any_rowwise_test.py index 86d9aa41..1343f283 100644 --- a/tests/dataframe/any_rowwise_test.py +++ b/tests/dataframe/any_rowwise_test.py @@ -9,8 +9,8 @@ def test_any_rowwise(library: str) -> None: df = bool_dataframe_1(library) - df.__dataframe_namespace__() - result = df.filter(df.any_rowwise()) + namespace = df.__dataframe_namespace__() + result = df.filter(namespace.any_rowwise()) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [True, True, False], "b": [True, True, True]}) diff --git a/tests/dataframe/comparisons_test.py b/tests/dataframe/comparisons_test.py index 25517c4e..7a3537fe 100644 --- a/tests/dataframe/comparisons_test.py +++ b/tests/dataframe/comparisons_test.py @@ -1,7 +1,9 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd -import polars as pl import pytest from tests.utils import convert_dataframe_to_pandas_numpy @@ -27,25 +29,17 @@ ("__mod__", {"a": [1, 0, 1], "b": [0, 1, 0]}), ], ) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) def test_comparisons_with_scalar( library: str, comparison: str, expected_data: dict[str, object], - request: pytest.FixtureRequest, + relax: Callable[[Any], Any], ) -> None: - df = integer_dataframe_1(library) + df = relax(integer_dataframe_1(library)) other = 2 result = getattr(df, comparison)(other) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame(expected_data) pd.testing.assert_frame_equal(result_pd, expected) - - -def test_comparison_invalid(library: str, request: pytest.FixtureRequest) -> None: - df = integer_dataframe_1(library).select(["a"]) - other = integer_dataframe_1(library).select(["b"]) - with pytest.raises( - (ValueError, pl.exceptions.DuplicateError, NotImplementedError), - ): - assert df > other diff --git a/tests/dataframe/divmod_test.py b/tests/dataframe/divmod_test.py index e6be8569..7fcd8288 100644 --- a/tests/dataframe/divmod_test.py +++ b/tests/dataframe/divmod_test.py @@ -1,19 +1,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import Any +from typing import Callable import pandas as pd +import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas -if TYPE_CHECKING: - import pytest - -def test_divmod_with_scalar(library: str, request: pytest.FixtureRequest) -> None: - df = integer_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_divmod_with_scalar(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) other = 2 result_quotient, result_remainder = df.__divmod__(other) result_quotient_pd = interchange_to_pandas(result_quotient, library) diff --git a/tests/dataframe/drop_column_test.py b/tests/dataframe/drop_column_test.py index dd720e2f..de20172d 100644 --- a/tests/dataframe/drop_column_test.py +++ b/tests/dataframe/drop_column_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -8,16 +11,11 @@ from tests.utils import interchange_to_pandas -def test_drop_column(library: str) -> None: - df = integer_dataframe_1(library) - result = df.drop_column("a") +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_drop_column(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) + result = df.drop_columns("a") result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"b": [4, 5, 6]}) pd.testing.assert_frame_equal(result_pd, expected) - - -def test_drop_column_invalid(library: str) -> None: - df = integer_dataframe_1(library) - with pytest.raises(TypeError, match="Expected str, got: <class 'list'>"): - df.drop_column(["a"]) diff --git a/tests/dataframe/fill_nan_test.py b/tests/dataframe/fill_nan_test.py index 3f434ac9..78f6d6ef 100644 --- a/tests/dataframe/fill_nan_test.py +++ b/tests/dataframe/fill_nan_test.py @@ -1,13 +1,18 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd +import pytest from tests.utils import interchange_to_pandas from tests.utils import nan_dataframe_1 -def test_fill_nan(library: str) -> None: - df = nan_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_fill_nan(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(nan_dataframe_1(library)) result = df.fill_nan(-1) result_pd = interchange_to_pandas(result, library) result_pd = result_pd.astype("float64") @@ -15,8 +20,9 @@ def test_fill_nan(library: str) -> None: pd.testing.assert_frame_equal(result_pd, expected) -def test_fill_nan_with_null(library: str) -> None: - df = nan_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_fill_nan_with_null(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(nan_dataframe_1(library)) namespace = df.__dataframe_namespace__() result = df.fill_nan(namespace.null) n_nans = result.is_nan().sum() diff --git a/tests/dataframe/fill_null_test.py b/tests/dataframe/fill_null_test.py index d31c6c4b..9213bcb4 100644 --- a/tests/dataframe/fill_null_test.py +++ b/tests/dataframe/fill_null_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pytest from tests.utils import interchange_to_pandas @@ -17,29 +20,31 @@ ["b"], ], ) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) def test_fill_null( - library: str, - column_names: list[str] | None, + library: str, column_names: list[str] | None, relax: Callable[[Any], Any] ) -> None: - df = null_dataframe_2(library) + df = relax(null_dataframe_2(library)) + namespace = df.__dataframe_namespace__() result = df.fill_null(0, column_names=column_names) if column_names is None or "a" in column_names: - res1 = result.filter(result.get_column_by_name("a").is_null()) - res1 = maybe_collect(res1, library) + res1 = result.filter(namespace.col("a").is_null()) + res1 = maybe_collect(res1) # check there no nulls left in the column assert res1.__dataframe__().num_rows() == 0 # check the last element was filled with 0 assert interchange_to_pandas(result, library)["a"].iloc[2] == 0 if column_names is None or "b" in column_names: - res1 = result.filter(result.get_column_by_name("b").is_null()) - res1 = maybe_collect(res1, library) + res1 = result.filter(namespace.col("b").is_null()) + res1 = maybe_collect(res1) assert res1.__dataframe__().num_rows() == 0 assert interchange_to_pandas(result, library)["b"].iloc[2] == 0 -def test_fill_null_noop(library: str) -> None: - df = nan_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_fill_null_noop(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(nan_dataframe_1(library)) result = df.fill_null(0) if hasattr(result.dataframe, "collect"): result = result.dataframe.collect() diff --git a/tests/dataframe/get_column_by_name_test.py b/tests/dataframe/get_column_by_name_test.py index 604bd350..58dfd8a6 100644 --- a/tests/dataframe/get_column_by_name_test.py +++ b/tests/dataframe/get_column_by_name_test.py @@ -1,25 +1,20 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas -def test_get_column_by_name(library: str) -> None: +def test_get_column(library: str) -> None: df = integer_dataframe_1(library) - result = df.get_column_by_name("a").rename("_tmp") - result = df.insert(0, "_tmp", result).drop_column("a").rename_columns({"_tmp": "a"}) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = col("a").rename("_tmp") + result = df.assign(result).drop_columns("a").rename_columns({"_tmp": "a"}) df.__dataframe_namespace__() result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})[["b", "a"]] pd.testing.assert_frame_equal(result_pd, expected) - - -def test_get_column_by_name_invalid(library: str) -> None: - df = integer_dataframe_1(library) - with pytest.raises((ValueError, TypeError)): - df.get_column_by_name([True, False]) diff --git a/tests/dataframe/get_columns_by_name_test.py b/tests/dataframe/get_columns_by_name_test.py index c892c49b..51dda9a1 100644 --- a/tests/dataframe/get_columns_by_name_test.py +++ b/tests/dataframe/get_columns_by_name_test.py @@ -1,7 +1,6 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 @@ -10,14 +9,72 @@ def test_select(library: str) -> None: df = integer_dataframe_1(library) - result = df.select(["b"]) + result = df.select("b") result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"b": [4, 5, 6]}) pd.testing.assert_frame_equal(result_pd, expected) -def test_get_columns_by_name_invalid(library: str) -> None: +def test_select_list_of_str(library: str) -> None: df = integer_dataframe_1(library) - with pytest.raises(TypeError, match=r"Expected sequence of str, got <class \'str\'>"): - df.select("b") + result = df.select("a", "b") + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_select_expressions(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.select(col("b") + 1) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"b": [5, 6, 7]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_select_multiple_expressions(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.select(col("b") + 1, col("b").rename("c") + 2) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"b": [5, 6, 7], "c": [6, 7, 8]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_select_reduction(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.select(col("a").mean(), col("b").mean()) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"a": [2.0], "b": [5.0]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_select_broadcast_right(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.select(col("a"), col("b").mean()) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [5.0, 5.0, 5.0]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_select_broadcast_left(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.select(col("a").mean(), col("b")) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"a": [2.0, 2.0, 2.0], "b": [4, 5, 6]}) + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/get_rows_by_mask_test.py b/tests/dataframe/get_rows_by_mask_test.py index b469fd84..674e9fb2 100644 --- a/tests/dataframe/get_rows_by_mask_test.py +++ b/tests/dataframe/get_rows_by_mask_test.py @@ -1,31 +1,18 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 -from tests.utils import integer_dataframe_2 from tests.utils import interchange_to_pandas def test_filter(library: str) -> None: df = integer_dataframe_1(library) namespace = df.__dataframe_namespace__() - mask = namespace.column_from_sequence( - [True, False, True], dtype=namespace.Bool(), name="result" - ) + mask = namespace.col("a") % 2 == 1 result = df.filter(mask) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [1, 3], "b": [4, 6]}) pd.testing.assert_frame_equal(result_pd, expected) - - -def test_get_column_by_name_invalid_lazy() -> None: - df1 = integer_dataframe_1("polars-lazy") - df2 = integer_dataframe_2("polars-lazy") - with pytest.raises( - ValueError, match="Column was created from a different dataframe!" - ): - df1.filter(df2.get_column_by_name("a") > 0) diff --git a/tests/dataframe/get_rows_test.py b/tests/dataframe/get_rows_test.py index 1fe36737..6a07587a 100644 --- a/tests/dataframe/get_rows_test.py +++ b/tests/dataframe/get_rows_test.py @@ -10,11 +10,10 @@ def test_get_rows(library: str) -> None: df = integer_dataframe_1(library) namespace = df.__dataframe_namespace__() - indices = namespace.column_from_sequence( - [0, 2, 1], dtype=namespace.Int64(), name="result" - ) - result = df.get_rows(indices) + df = df.assign((namespace.col("a") - 1).sort(ascending=False).rename("result")) + namespace = df.__dataframe_namespace__() + result = df.get_rows(namespace.col("result")) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 3, 2], "b": [4, 6, 5]}) + expected = pd.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "result": [0, 1, 2]}) pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/insert_test.py b/tests/dataframe/insert_test.py index a86ded30..d39070d5 100644 --- a/tests/dataframe/insert_test.py +++ b/tests/dataframe/insert_test.py @@ -12,16 +12,14 @@ import pytest -def test_insert(library: str, request: pytest.FixtureRequest) -> None: - df = integer_dataframe_1(library) +def test_insert_columns(library: str, request: pytest.FixtureRequest) -> None: + df = integer_dataframe_1(library, api_version="2023.09-beta") namespace = df.__dataframe_namespace__() - new_col = namespace.column_from_sequence( - [7, 8, 9], dtype=namespace.Int64(), name="result" - ) - result = df.insert(1, "c", new_col) + new_col = (namespace.col("b") + 3).rename("result") + result = df.assign(new_col.rename("c")) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 2, 3], "c": [7, 8, 9], "b": [4, 5, 6]}) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) pd.testing.assert_frame_equal(result_pd, expected) # check original df didn't change df_pd = interchange_to_pandas(df, library) @@ -30,19 +28,60 @@ def test_insert(library: str, request: pytest.FixtureRequest) -> None: pd.testing.assert_frame_equal(df_pd, expected) -def test_insert_columns(library: str, request: pytest.FixtureRequest) -> None: +def test_insert_multiple_columns(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_1(library, api_version="2023.09-beta") namespace = df.__dataframe_namespace__() - new_col = namespace.column_from_sequence( - [7, 8, 9], dtype=namespace.Int64(), name="result" + new_col = (namespace.col("b") + 3).rename("result") + result = df.assign(new_col.rename("c"), new_col.rename("d")) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [7, 8, 9]} ) - result = df.insert_column(new_col.rename("c")) + pd.testing.assert_frame_equal(result_pd, expected) + # check original df didn't change + df_pd = interchange_to_pandas(df, library) + df_pd = convert_dataframe_to_pandas_numpy(df_pd) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pd.testing.assert_frame_equal(df_pd, expected) + + +def test_insert_eager_columns(library: str, request: pytest.FixtureRequest) -> None: + df = integer_dataframe_1(library, api_version="2023.09-beta") + new_col = (df.collect().get_column_by_name("b") + 3).rename("result") + result = df.assign(new_col.rename("c"), new_col.rename("d")) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + expected = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [7, 8, 9]} + ) pd.testing.assert_frame_equal(result_pd, expected) # check original df didn't change df_pd = interchange_to_pandas(df, library) df_pd = convert_dataframe_to_pandas_numpy(df_pd) expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pd.testing.assert_frame_equal(df_pd, expected) + + +def test_insert_reduction(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + new_col = (namespace.col("b").mean()).rename("result") + result = df.assign(new_col) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [5.0, 5.0, 5.0]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_insert_reduction_and_column(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.assign(col("b").mean().rename("c"), col("b").rename("d")) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [5.0, 5.0, 5.0], "d": [4, 5, 6]} + ) + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/invert_test.py b/tests/dataframe/invert_test.py index e4bd9332..9e8134ba 100644 --- a/tests/dataframe/invert_test.py +++ b/tests/dataframe/invert_test.py @@ -1,14 +1,19 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd +import pytest from tests.utils import bool_dataframe_1 from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import interchange_to_pandas -def test_invert(library: str) -> None: - df = bool_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_invert(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(bool_dataframe_1(library)) result = ~df result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) diff --git a/tests/dataframe/is_nan_test.py b/tests/dataframe/is_nan_test.py index fd4e7039..14c92b8b 100644 --- a/tests/dataframe/is_nan_test.py +++ b/tests/dataframe/is_nan_test.py @@ -1,18 +1,18 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import Any +from typing import Callable import pandas as pd +import pytest from tests.utils import interchange_to_pandas from tests.utils import nan_dataframe_1 -if TYPE_CHECKING: - import pytest - -def test_dataframe_is_nan(library: str, request: pytest.FixtureRequest) -> None: - df = nan_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_dataframe_is_nan(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(nan_dataframe_1(library)) result = df.is_nan() result_pd = interchange_to_pandas(result, library) expected = pd.DataFrame({"a": [False, False, True]}) diff --git a/tests/dataframe/is_null_test.py b/tests/dataframe/is_null_test.py index cb351923..de6204f3 100644 --- a/tests/dataframe/is_null_test.py +++ b/tests/dataframe/is_null_test.py @@ -1,14 +1,19 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd +import pytest from tests.utils import interchange_to_pandas from tests.utils import nan_dataframe_2 from tests.utils import null_dataframe_1 -def test_is_null_1(library: str) -> None: - df = nan_dataframe_2(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_is_null_1(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(nan_dataframe_2(library)) result = df.is_null() result_pd = interchange_to_pandas(result, library) if library == "pandas-numpy": @@ -19,8 +24,9 @@ def test_is_null_1(library: str) -> None: pd.testing.assert_frame_equal(result_pd, expected) -def test_is_null_2(library: str) -> None: - df = null_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_is_null_2(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(null_dataframe_1(library)) result = df.is_null() result_pd = interchange_to_pandas(result, library) expected = pd.DataFrame({"a": [False, False, True]}) diff --git a/tests/dataframe/join_test.py b/tests/dataframe/join_test.py index 21ccca3f..53e249ce 100644 --- a/tests/dataframe/join_test.py +++ b/tests/dataframe/join_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -8,9 +11,10 @@ from tests.utils import interchange_to_pandas -def test_join_left(library: str) -> None: - left = integer_dataframe_1(library) - right = integer_dataframe_2(library).rename_columns({"b": "c"}) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_join_left(library: str, relax: Callable[[Any], Any]): + left = relax(integer_dataframe_1(library)) + right = relax(integer_dataframe_2(library).rename_columns({"b": "c"})) result = left.join(right, left_on="a", right_on="a", how="left") result_pd = interchange_to_pandas(result, library) expected = pd.DataFrame( diff --git a/tests/dataframe/or_test.py b/tests/dataframe/or_test.py index 80cceb98..efff421d 100644 --- a/tests/dataframe/or_test.py +++ b/tests/dataframe/or_test.py @@ -1,14 +1,19 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd +import pytest from tests.utils import bool_dataframe_1 from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import interchange_to_pandas -def test_or_with_scalar(library: str) -> None: - df = bool_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_or_with_scalar(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(bool_dataframe_1(library)) other = True result = df | other result_pd = interchange_to_pandas(result, library) diff --git a/tests/dataframe/reductions_test.py b/tests/dataframe/reductions_test.py index 24b4212f..642fa0c2 100644 --- a/tests/dataframe/reductions_test.py +++ b/tests/dataframe/reductions_test.py @@ -1,8 +1,12 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest +from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 from tests.utils import interchange_to_pandas @@ -20,10 +24,15 @@ ("var", pd.DataFrame({"a": [1.0], "b": [1.0]})), ], ) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) def test_dataframe_reductions( - library: str, reduction: str, expected: pd.DataFrame + library: str, + reduction: str, + expected: pd.DataFrame, + relax: Callable[[Any], Any], ) -> None: - df = integer_dataframe_1(library) + df = relax(integer_dataframe_1(library)) result = getattr(df, reduction)() result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/rename_columns_test.py b/tests/dataframe/rename_columns_test.py index 2e803a46..918df009 100644 --- a/tests/dataframe/rename_columns_test.py +++ b/tests/dataframe/rename_columns_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -8,8 +11,9 @@ from tests.utils import interchange_to_pandas -def test_rename_columns(library: str) -> None: - df = integer_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_rename_columns(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) result = df.rename_columns({"a": "c", "b": "e"}) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) @@ -17,8 +21,9 @@ def test_rename_columns(library: str) -> None: pd.testing.assert_frame_equal(result_pd, expected) -def test_rename_columns_invalid(library: str) -> None: - df = integer_dataframe_1(library) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_rename_columns_invalid(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) with pytest.raises( TypeError, match="Expected Mapping, got: <class 'function'>" ): # pragma: no cover diff --git a/tests/dataframe/schema_test.py b/tests/dataframe/schema_test.py index 8b3e2841..b0200316 100644 --- a/tests/dataframe/schema_test.py +++ b/tests/dataframe/schema_test.py @@ -1,10 +1,16 @@ from __future__ import annotations +from typing import Any +from typing import Callable + +import pytest + from tests.utils import mixed_dataframe_1 -def test_schema(library: str) -> None: - df = mixed_dataframe_1(library) +@pytest.mark.parametrize("maybe_collect", [lambda x: x, lambda x: x.collect()]) +def test_schema(library: str, maybe_collect: Callable[[Any], Any]) -> None: + df = maybe_collect(mixed_dataframe_1(library)) namespace = df.__dataframe_namespace__() result = df.schema assert list(result.keys()) == [ @@ -20,6 +26,11 @@ def test_schema(library: str) -> None: "j", "k", "l", + "m", + "n", + "o", + "p", + "q", ] assert isinstance(result["a"], namespace.Int64) assert isinstance(result["b"], namespace.Int32) @@ -33,3 +44,16 @@ def test_schema(library: str) -> None: assert isinstance(result["j"], namespace.Float32) assert isinstance(result["k"], namespace.Bool) assert isinstance(result["l"], namespace.String) + assert isinstance(result["m"], namespace.Date) + assert ( + isinstance(result["n"], namespace.Datetime) + and result["n"].time_unit == "ms" + and result["n"].time_zone is None + ) + assert ( + isinstance(result["o"], namespace.Datetime) + and result["o"].time_unit == "us" + and result["o"].time_zone is None + ) + assert isinstance(result["p"], namespace.Duration) and result["p"].time_unit == "ms" + assert isinstance(result["q"], namespace.Duration) and result["q"].time_unit == "us" diff --git a/tests/dataframe/shape_test.py b/tests/dataframe/shape_test.py deleted file mode 100644 index 9c74d885..00000000 --- a/tests/dataframe/shape_test.py +++ /dev/null @@ -1,12 +0,0 @@ -from __future__ import annotations - -import pytest - -from tests.utils import integer_dataframe_1 - - -def test_shape(library: str, request: pytest.FixtureRequest) -> None: - if library == "polars-lazy": - request.node.add_marker(pytest.mark.xfail()) - df = integer_dataframe_1(library) - assert df.shape() == (3, 2) diff --git a/tests/dataframe/slice_rows_test.py b/tests/dataframe/slice_rows_test.py index d57f22e4..fd9217b3 100644 --- a/tests/dataframe/slice_rows_test.py +++ b/tests/dataframe/slice_rows_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -17,14 +20,16 @@ (2, None, None, pd.DataFrame({"a": [3, 4, 5, 6, 7], "b": [5, 4, 3, 2, 1]})), ], ) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) def test_slice_rows( library: str, start: int | None, stop: int | None, step: int | None, expected: pd.DataFrame, + relax: Callable[[Any], Any], ) -> None: - df = integer_dataframe_3(library) + df = relax(integer_dataframe_3(library)) result = df.slice_rows(start, stop, step) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) diff --git a/tests/dataframe/sort_test.py b/tests/dataframe/sort_test.py index 5e54e1f0..5c3c5189 100644 --- a/tests/dataframe/sort_test.py +++ b/tests/dataframe/sort_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest @@ -8,23 +11,24 @@ from tests.utils import interchange_to_pandas -@pytest.mark.parametrize("keys", [["a", "b"], None]) -def test_sort( - library: str, - keys: list[str] | None, -) -> None: - df = integer_dataframe_5(library, api_version="2023.09-beta") - result = df.sort(keys=keys) +@pytest.mark.parametrize("keys", [["a", "b"], []]) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_sort(library: str, keys: list[str], relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_5(library, api_version="2023.09-beta")) + result = df.sort(*keys) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [1, 1], "b": [3, 4]}) pd.testing.assert_frame_equal(result_pd, expected) -@pytest.mark.parametrize("keys", [["a", "b"], None]) -def test_sort_descending(library: str, keys: list[str] | None) -> None: - df = integer_dataframe_5(library, api_version="2023.09-beta") - result = df.sort(keys=keys, ascending=False) +@pytest.mark.parametrize("keys", [["a", "b"], []]) +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_sort_descending( + library: str, keys: list[str], relax: Callable[[Any], Any] +) -> None: + df = relax(integer_dataframe_5(library, api_version="2023.09-beta")) + result = df.sort(*keys, ascending=False) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [1, 1], "b": [4, 3]}) diff --git a/tests/dataframe/sorted_indices_test.py b/tests/dataframe/sorted_indices_test.py deleted file mode 100644 index 1e3ffefa..00000000 --- a/tests/dataframe/sorted_indices_test.py +++ /dev/null @@ -1,41 +0,0 @@ -from __future__ import annotations - -import pandas as pd - -from tests.utils import convert_dataframe_to_pandas_numpy -from tests.utils import integer_dataframe_5 -from tests.utils import interchange_to_pandas - - -def test_sorted_indices( - library: str, -) -> None: - df = integer_dataframe_5(library) - sorted_indices = df.sorted_indices(keys=["a", "b"]) - result = df.get_rows(sorted_indices) - result_pd = interchange_to_pandas(result, library) - result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 1], "b": [3, 4]}) - pd.testing.assert_frame_equal(result_pd, expected) - - -def test_sorted_indices_descending( - library: str, -) -> None: - df = integer_dataframe_5(library) - sorted_indices = df.sorted_indices(keys=["a", "b"], ascending=False) - result = df.get_rows(sorted_indices) - result_pd = interchange_to_pandas(result, library) - result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [1, 1], "b": [4, 3]}) - pd.testing.assert_frame_equal(result_pd, expected) - - -def test_sorted_indices_with_insert(library: str) -> None: - df = integer_dataframe_5(library) - result = df.insert(0, "idx", df.sorted_indices()) - result_pd = interchange_to_pandas(result, library) - result_pd = convert_dataframe_to_pandas_numpy(result_pd) - result_pd["idx"] = result_pd["idx"].astype("int64") - expected = pd.DataFrame({"idx": [1, 0], "a": [1, 1], "b": [4, 3]}) - pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/to_array_object_test.py b/tests/dataframe/to_array_object_test.py index 13a0df18..b56939fc 100644 --- a/tests/dataframe/to_array_object_test.py +++ b/tests/dataframe/to_array_object_test.py @@ -6,7 +6,7 @@ def test_to_array_object(library: str) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() result = np.asarray(df.to_array_object(dtype="int64")) expected = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.int64) np.testing.assert_array_equal(result, expected) diff --git a/tests/dataframe/unique_indices_test.py b/tests/dataframe/unique_indices_test.py deleted file mode 100644 index 3b184ce9..00000000 --- a/tests/dataframe/unique_indices_test.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pytest - -from tests.utils import convert_dataframe_to_pandas_numpy -from tests.utils import integer_dataframe_6 -from tests.utils import interchange_to_pandas - - -@pytest.mark.parametrize( - ("keys", "expected_data"), - [ - (["a", "b"], {"a": [1, 1, 2, 2], "b": [3, 4, 1, 2]}), - (None, {"a": [1, 1, 2, 2], "b": [3, 4, 1, 2]}), - (["a"], {"a": [1, 2], "b": [4, 1]}), - (["b"], {"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}), - ], -) -def test_unique_indices( - library: str, - keys: list[str] | None, - expected_data: dict[str, list[int]], - request: pytest.FixtureRequest, -) -> None: - df = integer_dataframe_6(library) - if library == "polars-lazy": - # not yet implemented, need to figure this out - request.node.add_marker(pytest.mark.xfail()) - df = df.get_rows(df.unique_indices(keys)) - result = df.get_rows(df.sorted_indices(keys)) - result_pd = interchange_to_pandas(result, library) - result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame(expected_data) - pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/update_columns_test.py b/tests/dataframe/update_columns_test.py new file mode 100644 index 00000000..2b18a651 --- /dev/null +++ b/tests/dataframe/update_columns_test.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from typing import Any +from typing import Callable + +import pandas as pd +import pytest + +from tests.utils import integer_dataframe_1 +from tests.utils import interchange_to_pandas + + +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_update_columns(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.assign(col("a") + 1) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [2, 3, 4], "b": [4, 5, 6]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +@pytest.mark.parametrize("relax", [lambda x: x, lambda x: x.collect()]) +def test_update_multiple_columns(library: str, relax: Callable[[Any], Any]) -> None: + df = relax(integer_dataframe_1(library)) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.assign(col("a") + 1, col("b") + 2) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [2, 3, 4], "b": [6, 7, 8]}) + pd.testing.assert_frame_equal(result_pd, expected) + + +def test_update_broadcast(library: str) -> None: + df = integer_dataframe_1(library) + namespace = df.__dataframe_namespace__() + col = namespace.col + result = df.assign(col("a").mean(), col("b") + 2) + result_pd = interchange_to_pandas(result, library) + expected = pd.DataFrame({"a": [2.0, 2.0, 2.0], "b": [6, 7, 8]}) + pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/dataframe/update_test.py b/tests/dataframe/update_test.py index 24812c53..e7d3e2c2 100644 --- a/tests/dataframe/update_test.py +++ b/tests/dataframe/update_test.py @@ -1,7 +1,6 @@ from __future__ import annotations import pandas as pd -import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 @@ -11,8 +10,8 @@ def test_update_column(library: str) -> None: df = integer_dataframe_1(library, api_version="2023.09-beta") namespace = df.__dataframe_namespace__() - new_col = namespace.column_from_sequence([7, 8, 9], dtype=namespace.Int64(), name="b") - result = df.update_columns(new_col) + new_col = namespace.col("b") + 3 + result = df.assign(new_col) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [1, 2, 3], "b": [7, 8, 9]}) @@ -27,25 +26,15 @@ def test_update_column(library: str) -> None: def test_update_columns(library: str) -> None: df = integer_dataframe_1(library, api_version="2023.09-beta") namespace = df.__dataframe_namespace__() - new_col_a = namespace.column_from_sequence( - [5, 2, 1], dtype=namespace.Int64(), name="a" - ) - new_col_b = namespace.column_from_sequence( - [7, 8, 9], dtype=namespace.Int64(), name="b" - ) - result = df.update_columns([new_col_a, new_col_b]) + new_col_a = namespace.col("a") + 1 + new_col_b = namespace.col("b") + 3 + result = df.assign(new_col_a, new_col_b) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) - expected = pd.DataFrame({"a": [5, 2, 1], "b": [7, 8, 9]}) + expected = pd.DataFrame({"a": [2, 3, 4], "b": [7, 8, 9]}) pd.testing.assert_frame_equal(result_pd, expected) # check original df didn't change df_pd = interchange_to_pandas(df, library) df_pd = convert_dataframe_to_pandas_numpy(df_pd) expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pd.testing.assert_frame_equal(df_pd, expected) - - -def test_update_columns_invalid(library: str) -> None: - df = integer_dataframe_1(library, api_version="2023.09-beta") - with pytest.raises(ValueError): - df.update_columns(df.get_column_by_name("a").rename("c")) diff --git a/tests/groupby/groupby_any_all_test.py b/tests/groupby/groupby_any_all_test.py index 935f4dce..3634fde5 100644 --- a/tests/groupby/groupby_any_all_test.py +++ b/tests/groupby/groupby_any_all_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pandas as pd import pytest from polars.exceptions import SchemaError @@ -17,17 +20,19 @@ ("all", [False, True], [False, False]), ], ) -def test_group_by_boolean( +@pytest.mark.parametrize("maybe_collect", [lambda x: x, lambda x: x.collect()]) +def test_groupby_boolean( library: str, aggregation: str, expected_b: list[bool], expected_c: list[bool], - request: pytest.FixtureRequest, + maybe_collect: Callable[[Any], Any], ) -> None: - df = bool_dataframe_2(library) - result = getattr(df.group_by(["key"]), aggregation)() + df = maybe_collect(bool_dataframe_2(library)) + namespace = df.__dataframe_namespace__() + result = getattr(df.group_by("key"), aggregation)() # need to sort - idx = result.sorted_indices(["key"]) + idx = namespace.sorted_indices("key") result = result.get_rows(idx) result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) @@ -38,6 +43,6 @@ def test_group_by_boolean( def test_group_by_invalid_any_all(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_4(library) with pytest.raises((ValueError, SchemaError)): - df.group_by(["key"]).any() + df.group_by("key").any() with pytest.raises((ValueError, SchemaError)): - df.group_by(["key"]).all() + df.group_by("key").all() diff --git a/tests/groupby/invalid_test.py b/tests/groupby/invalid_test.py index 030a7e20..b0e5004a 100644 --- a/tests/groupby/invalid_test.py +++ b/tests/groupby/invalid_test.py @@ -1,15 +1,19 @@ from __future__ import annotations +from typing import Any +from typing import Callable + import pytest from tests.utils import integer_dataframe_1 -def test_group_by_invalid(library: str) -> None: - df = integer_dataframe_1(library).select(["a"]) +@pytest.mark.parametrize("maybe_collect", [lambda x: x, lambda x: x.collect()]) +def test_groupby_invalid(library: str, maybe_collect: Callable[[Any], Any]) -> None: + df = maybe_collect(integer_dataframe_1(library)).select("a") with pytest.raises((KeyError, TypeError)): df.group_by(0) with pytest.raises((KeyError, TypeError)): df.group_by("0") with pytest.raises((KeyError, TypeError)): - df.group_by(["b"]) + df.group_by("b") diff --git a/tests/groupby/numeric_test.py b/tests/groupby/numeric_test.py index dfc81c1c..0907bfb4 100644 --- a/tests/groupby/numeric_test.py +++ b/tests/groupby/numeric_test.py @@ -33,9 +33,8 @@ def test_group_by_numeric( request: pytest.FixtureRequest, ) -> None: df = integer_dataframe_4(library) - result = getattr(df.group_by(["key"]), aggregation)() - sorted_indices = result.sorted_indices(["key"]) - result = result.get_rows(sorted_indices) + result = getattr(df.group_by("key"), aggregation)() + result = result.sort("key") result_pd = interchange_to_pandas(result, library) result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"key": [1, 2], "b": expected_b, "c": expected_c}) diff --git a/tests/groupby/size_test.py b/tests/groupby/size_test.py index 3e326210..a90856c9 100644 --- a/tests/groupby/size_test.py +++ b/tests/groupby/size_test.py @@ -14,10 +14,8 @@ def test_group_by_size(library: str, request: pytest.FixtureRequest) -> None: df = integer_dataframe_4(library) - result = df.group_by(["key"]).size() - # got to sort - idx = result.sorted_indices(["key"]) - result = result.get_rows(idx) + result = df.group_by("key").size() + result = result.sort("key") result_pd = interchange_to_pandas(result, library) expected = pd.DataFrame({"key": [1, 2], "size": [2, 2]}) # TODO polars returns uint32. what do we standardise to? diff --git a/tests/namespace/column_from_1d_array_test.py b/tests/namespace/column_from_1d_array_test.py index a31b7ef5..24863e66 100644 --- a/tests/namespace/column_from_1d_array_test.py +++ b/tests/namespace/column_from_1d_array_test.py @@ -1,5 +1,9 @@ from __future__ import annotations +from datetime import date +from datetime import datetime +from datetime import timedelta + import numpy as np import pandas as pd import pytest @@ -11,6 +15,8 @@ @pytest.mark.parametrize( ("namespace_dtype", "pandas_dtype"), [ + ("Float64", "float64"), + ("Float32", "float32"), ("Int64", "int64"), ("Int32", "int32"), ("Int16", "int16"), @@ -24,7 +30,7 @@ def test_column_from_1d_array( library: str, namespace_dtype: str, pandas_dtype: str ) -> None: - ser = integer_dataframe_1(library).get_column_by_name("a") + ser = integer_dataframe_1(library).collect().get_column_by_name("a") namespace = ser.__column_namespace__() arr = np.array([1, 2, 3]) result = namespace.dataframe_from_dict( @@ -48,7 +54,7 @@ def test_column_from_1d_array( def test_column_from_1d_array_string( library: str, namespace_dtype: str, pandas_dtype: str ) -> None: - ser = integer_dataframe_1(library).get_column_by_name("a") + ser = integer_dataframe_1(library).collect().get_column_by_name("a") namespace = ser.__column_namespace__() arr = np.array(["a", "b", "c"]) result = namespace.dataframe_from_dict( @@ -63,10 +69,66 @@ def test_column_from_1d_array_string( pd.testing.assert_series_equal(result_pd, expected) -def test_column_from_array_invalid(library: str) -> None: - namespace = integer_dataframe_1(library).__dataframe_namespace__() - arr = np.array(["a", "b", "c"]) - with pytest.raises(ValueError): - namespace.column_from_1d_array( - arr, name="result", dtype=namespace.String(), api_version="dfdaf" - ) +@pytest.mark.parametrize( + ("namespace_dtype", "pandas_dtype"), + [ + ("Bool", "bool"), + ], +) +def test_column_from_1d_array_bool( + library: str, namespace_dtype: str, pandas_dtype: str +) -> None: + ser = integer_dataframe_1(library).collect().get_column_by_name("a") + namespace = ser.__column_namespace__() + arr = np.array([True, False, True]) + result = namespace.dataframe_from_dict( + { + "result": namespace.column_from_1d_array( + arr, name="result", dtype=getattr(namespace, namespace_dtype)() + ) + } + ) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series([True, False, True], name="result", dtype=pandas_dtype) + pd.testing.assert_series_equal(result_pd, expected) + + +def test_datetime_from_1d_array(library: str) -> None: + ser = integer_dataframe_1(library).collect().get_column_by_name("a") + namespace = ser.__column_namespace__() + arr = np.array([date(2020, 1, 1), date(2020, 1, 2)], dtype="datetime64[ms]") + result = namespace.dataframe_from_dict( + { + "result": namespace.column_from_1d_array( + arr, name="result", dtype=namespace.Datetime("ms") + ) + } + ) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series( + [datetime(2020, 1, 1), datetime(2020, 1, 2)], + name="result", + dtype="datetime64[ms]", + ) + pd.testing.assert_series_equal(result_pd, expected) + + +def test_duration_from_1d_array(library: str) -> None: + if library == "polars-lazy": + # needs fixing upstream + return + ser = integer_dataframe_1(library).collect().get_column_by_name("a") + namespace = ser.__column_namespace__() + arr = np.array([timedelta(1), timedelta(2)], dtype="timedelta64[ms]") + result = namespace.dataframe_from_dict( + { + "result": namespace.column_from_1d_array( + arr, name="result", dtype=namespace.Duration("ms") + ) + } + ) + result_pd = interchange_to_pandas(result, library)["result"] + expected = pd.Series( + [timedelta(1), timedelta(2)], name="result", dtype="timedelta64[ms]" + ) + pd.testing.assert_series_equal(result_pd, expected) diff --git a/tests/namespace/column_from_sequence_test.py b/tests/namespace/column_from_sequence_test.py index 39656186..e1f51b22 100644 --- a/tests/namespace/column_from_sequence_test.py +++ b/tests/namespace/column_from_sequence_test.py @@ -38,7 +38,8 @@ def test_column_from_sequence( expected: pd.Series[Any], ) -> None: df = integer_dataframe_1(library) - ser = df.get_column_by_name("a") + namespace = df.__dataframe_namespace__() + ser = namespace.col("a") namespace = ser.__column_namespace__() result = namespace.dataframe_from_dict( { diff --git a/tests/namespace/column_names_test.py b/tests/namespace/column_names_test.py index b1333323..6ea39d42 100644 --- a/tests/namespace/column_names_test.py +++ b/tests/namespace/column_names_test.py @@ -1,28 +1,27 @@ from __future__ import annotations -import pytest +from typing import TYPE_CHECKING from tests.utils import integer_dataframe_1 +if TYPE_CHECKING: + import pytest + def test_column_names(library: str, request: pytest.FixtureRequest) -> None: - if library == "polars-lazy": - request.node.add_marker(pytest.mark.xfail()) # nameless column - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() + namespace = df.__dataframe_namespace__() ser = df.get_column_by_name("a") - namespace = ser.__column_namespace__() result = namespace.dataframe_from_dict({"result": ser}) - assert result.column_names == ["result"] - assert result.get_column_by_name("result").name == "result" + assert result.get_column_names() == ["result"] # named column ser = namespace.column_from_sequence( [1, 2, 3], dtype=namespace.Float64(), name="result" ) result = namespace.dataframe_from_dict({"result": ser}) - assert result.column_names == ["result"] - assert result.get_column_by_name("result").name == "result" + assert result.get_column_names() == ["result"] # named column (different name) ser = namespace.column_from_sequence( diff --git a/tests/namespace/dataframe_from_2d_array_test.py b/tests/namespace/dataframe_from_2d_array_test.py index 6b19bf40..fdb7c994 100644 --- a/tests/namespace/dataframe_from_2d_array_test.py +++ b/tests/namespace/dataframe_from_2d_array_test.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -import pytest from tests.utils import convert_dataframe_to_pandas_numpy from tests.utils import integer_dataframe_1 @@ -20,16 +19,3 @@ def test_dataframe_from_2d_array(library: str) -> None: result_pd = convert_dataframe_to_pandas_numpy(result_pd) expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) pd.testing.assert_frame_equal(result_pd, expected) - - -def test_dataframe_from_2d_array_invalid_version(library: str) -> None: - df = integer_dataframe_1(library) - namespace = df.__dataframe_namespace__() - arr = np.array([[1, 4], [2, 5], [3, 6]]) - with pytest.raises(ValueError): - namespace.dataframe_from_2d_array( - arr, - names=["a", "b"], - dtypes={"a": namespace.Int64(), "b": namespace.Int64()}, - api_version="123.456", - ) diff --git a/tests/namespace/is_dtype_test.py b/tests/namespace/is_dtype_test.py index e2a323f1..0ee51f3a 100644 --- a/tests/namespace/is_dtype_test.py +++ b/tests/namespace/is_dtype_test.py @@ -19,7 +19,7 @@ ], ) def test_is_dtype(library: str, dtype: str, expected: list[str]) -> None: - df = mixed_dataframe_1(library) + df = mixed_dataframe_1(library).collect() namespace = df.__dataframe_namespace__() result = [ i diff --git a/tests/namespace/sorted_indices_test.py b/tests/namespace/sorted_indices_test.py new file mode 100644 index 00000000..3efa64ff --- /dev/null +++ b/tests/namespace/sorted_indices_test.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import pandas as pd + +from tests.utils import convert_dataframe_to_pandas_numpy +from tests.utils import integer_dataframe_6 +from tests.utils import interchange_to_pandas + + +def test_column_sorted_indices_ascending(library: str) -> None: + df = integer_dataframe_6(library) + namespace = df.__dataframe_namespace__() + sorted_indices = namespace.sorted_indices("b") + result = df.assign(sorted_indices.rename("result")) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected_1 = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "result": [3, 4, 2, 0, 1], + } + ) + expected_2 = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "result": [3, 4, 2, 1, 0], + } + ) + if library in ("polars", "polars-lazy"): + result_pd["result"] = result_pd["result"].astype("int64") + try: + pd.testing.assert_frame_equal(result_pd, expected_1) + except AssertionError: # pragma: no cover + # order isn't determinist, so try both + pd.testing.assert_frame_equal(result_pd, expected_2) + + +def test_column_sorted_indices_descending(library: str) -> None: + df = integer_dataframe_6(library) + namespace = df.__dataframe_namespace__() + sorted_indices = namespace.sorted_indices("b", ascending=False) + result = df.assign(sorted_indices.rename("result")) + result_pd = interchange_to_pandas(result, library) + result_pd = convert_dataframe_to_pandas_numpy(result_pd) + expected_1 = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "result": [1, 0, 2, 4, 3], + } + ) + expected_2 = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "result": [0, 1, 2, 4, 3], + } + ) + if library in ("polars", "polars-lazy"): + result_pd["result"] = result_pd["result"].astype("int64") + try: + pd.testing.assert_frame_equal(result_pd, expected_1) + except AssertionError: + # order isn't determinist, so try both + pd.testing.assert_frame_equal(result_pd, expected_2) diff --git a/tests/namespace/to_array_object_test.py b/tests/namespace/to_array_object_test.py index 447738cc..c7eb0404 100644 --- a/tests/namespace/to_array_object_test.py +++ b/tests/namespace/to_array_object_test.py @@ -1,24 +1,19 @@ from __future__ import annotations import numpy as np -import pytest from tests.utils import integer_dataframe_1 def test_to_array_object(library: str) -> None: - df = integer_dataframe_1(library) + df = integer_dataframe_1(library).collect() result = np.asarray(df.to_array_object(dtype="int64")) expected = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.int64) np.testing.assert_array_equal(result, expected) def test_column_to_array_object(library: str) -> None: - col = integer_dataframe_1(library).get_column_by_name("a") - if library == "polars-lazy": - with pytest.raises(NotImplementedError): - col.to_array_object(dtype="int64") - return + col = integer_dataframe_1(library).collect().get_column_by_name("a") result = np.asarray(col.to_array_object(dtype="int64")) result = np.asarray(col.to_array_object(dtype="int64")) expected = np.array([1, 2, 3], dtype=np.int64) diff --git a/tests/utils.py b/tests/utils.py index 55788338..81435c9c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,7 +1,12 @@ from __future__ import annotations +from datetime import date +from datetime import datetime +from datetime import timedelta from typing import Any from typing import cast +from typing import TYPE_CHECKING +from typing import TypeVar import pandas as pd import polars as pl @@ -10,10 +15,37 @@ import dataframe_api_compat.pandas_standard import dataframe_api_compat.polars_standard +DType = TypeVar("DType") + +if TYPE_CHECKING: + from dataframe_api import ( + Bool, + PermissiveColumn, + Column, + DataFrame, + GroupBy, + ) +else: + + class DataFrame: + ... + + class PermissiveColumn: + ... + + class Column: + ... + + class GroupBy: + ... + + class Bool: + ... + def convert_to_standard_compliant_dataframe( df: pd.DataFrame | pl.DataFrame, api_version: str | None = None -) -> Any: +) -> DataFrame: # todo: type return if isinstance(df, pd.DataFrame): return ( @@ -66,7 +98,7 @@ def convert_dataframe_to_pandas_numpy(df: pd.DataFrame) -> pd.DataFrame: return df -def integer_dataframe_1(library: str, api_version: str | None = None) -> Any: +def integer_dataframe_1(library: str, api_version: str | None = None) -> DataFrame: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype="int64") @@ -80,7 +112,7 @@ def integer_dataframe_1(library: str, api_version: str | None = None) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def integer_dataframe_2(library: str) -> Any: +def integer_dataframe_2(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1, 2, 4], "b": [4, 2, 6]}, dtype="int64") @@ -94,7 +126,7 @@ def integer_dataframe_2(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def integer_dataframe_3(library: str) -> Any: +def integer_dataframe_3(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( @@ -112,7 +144,7 @@ def integer_dataframe_3(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def integer_dataframe_4(library: str) -> Any: +def integer_dataframe_4(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( @@ -158,7 +190,7 @@ def integer_dataframe_6(library: str, api_version: str | None = None) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def integer_dataframe_7(library: str) -> Any: +def integer_dataframe_7(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 4]}, dtype="int64") @@ -172,7 +204,7 @@ def integer_dataframe_7(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def nan_dataframe_1(library: str) -> Any: +def nan_dataframe_1(library) -> DataFrame: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1.0, 2.0, float("nan")]}, dtype="float64") @@ -187,7 +219,7 @@ def nan_dataframe_1(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def nan_dataframe_2(library: str) -> Any: +def nan_dataframe_2(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [0.0, 1.0, float("nan")]}, dtype="float64") @@ -202,7 +234,7 @@ def nan_dataframe_2(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def null_dataframe_1(library: str) -> Any: +def null_dataframe_1(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1.0, 2.0, float("nan")]}, dtype="float64") @@ -216,7 +248,7 @@ def null_dataframe_1(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def null_dataframe_2(library: str) -> Any: +def null_dataframe_2(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( @@ -235,25 +267,25 @@ def null_dataframe_2(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def bool_dataframe_1(library: str) -> Any: +def bool_dataframe_1(library: str, api_version="2023.09-beta") -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( {"a": [True, True, False], "b": [True, True, True]}, dtype="bool" ) - return convert_to_standard_compliant_dataframe(df) + return convert_to_standard_compliant_dataframe(df, api_version=api_version) if library == "pandas-nullable": df = pd.DataFrame( {"a": [True, True, False], "b": [True, True, True]}, dtype="boolean" ) - return convert_to_standard_compliant_dataframe(df) + return convert_to_standard_compliant_dataframe(df, api_version=api_version) if library == "polars-lazy": df = pl.LazyFrame({"a": [True, True, False], "b": [True, True, True]}) - return convert_to_standard_compliant_dataframe(df) + return convert_to_standard_compliant_dataframe(df, api_version=api_version) raise AssertionError(f"Got unexpected library: {library}") -def bool_dataframe_2(library: str) -> Any: +def bool_dataframe_2(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( @@ -285,7 +317,7 @@ def bool_dataframe_2(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def bool_dataframe_3(library: str) -> Any: +def bool_dataframe_3(library) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame( @@ -303,7 +335,7 @@ def bool_dataframe_3(library: str) -> Any: raise AssertionError(f"Got unexpected library: {library}") -def integer_series_1(library: str) -> Any: +def integer_series_1(library) -> Any: ser: Any if library == "pandas-numpy": ser = pd.Series([1, 2, 3]) @@ -321,10 +353,14 @@ def integer_series_5(library: str, request: pytest.FixtureRequest) -> Any: df: Any if library == "pandas-numpy": df = pd.DataFrame({"a": [1, 1, 4]}, dtype="int64") - return convert_to_standard_compliant_dataframe(df).get_column_by_name("a") + return ( + convert_to_standard_compliant_dataframe(df).collect().get_column_by_name("a") + ) if library == "pandas-nullable": df = pd.DataFrame({"a": [1, 1, 4]}, dtype="Int64") - return convert_to_standard_compliant_dataframe(df).get_column_by_name("a") + return ( + convert_to_standard_compliant_dataframe(df).collect().get_column_by_name("a") + ) if library == "polars-lazy": request.node.add_marker(pytest.mark.xfail()) raise AssertionError(f"Got unexpected library: {library}") @@ -373,14 +409,13 @@ def float_dataframe_3(library: str, request: pytest.FixtureRequest) -> object: raise AssertionError(f"Got unexpected library: {library}") -def bool_series_1(library: str) -> Any: - df: Any +def bool_series_1(library) -> Any: if library == "pandas-numpy": - df = pd.DataFrame({"a": [True, False, True]}, dtype="bool") - return convert_to_standard_compliant_dataframe(df).get_column_by_name("a") + ser = pd.Series([True, False, True], name="a", dtype="bool") + return convert_to_standard_compliant_column(ser) if library == "pandas-nullable": - df = pd.DataFrame({"a": [True, False, True]}, dtype="boolean") - return convert_to_standard_compliant_dataframe(df).get_column_by_name("a") + ser = pd.Series([True, False, True], name="a", dtype="boolean") + return convert_to_standard_compliant_column(ser) if library == "polars-lazy": ser = pl.Series("a", [True, False, True]) return convert_to_standard_compliant_column(ser) @@ -388,22 +423,21 @@ def bool_series_1(library: str) -> Any: def interchange_to_pandas(result: Any, library: str) -> pd.DataFrame: - df = ( - result.dataframe.collect() - if library in ("polars", "polars-lazy") - else result.dataframe - ) + if isinstance(result.dataframe, pl.LazyFrame): + df = result.dataframe.collect() + else: + df = result.dataframe df = pd.api.interchange.from_dataframe(df) df = convert_dataframe_to_pandas_numpy(df) return cast(pd.DataFrame, df) -def maybe_collect(result: Any, library: str) -> Any: - df = result.dataframe.collect() if library == "polars-lazy" else result.dataframe +def maybe_collect(result: Any) -> Any: + df = result.collect().dataframe if hasattr(result, "collect") else result.dataframe return df -def mixed_dataframe_1(library: str) -> Any: +def mixed_dataframe_1(library) -> Any: df: Any data = { "a": [1, 2, 3], @@ -418,6 +452,11 @@ def mixed_dataframe_1(library: str) -> Any: "j": [1.0, 2.0, 3.0], "k": [True, False, True], "l": ["a", "b", "c"], + "m": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 3)], + "n": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "o": [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "p": [timedelta(days=1), timedelta(days=2), timedelta(days=3)], + "q": [timedelta(days=1), timedelta(days=2), timedelta(days=3)], } if library == "pandas-numpy": df = pd.DataFrame(data).astype( @@ -434,6 +473,11 @@ def mixed_dataframe_1(library: str) -> Any: "j": "float32", "k": "bool", "l": "object", + "m": "datetime64[s]", + "n": "datetime64[ms]", + "o": "datetime64[us]", + "p": "timedelta64[ms]", + "q": "timedelta64[us]", } ) return convert_to_standard_compliant_dataframe(df) @@ -452,6 +496,11 @@ def mixed_dataframe_1(library: str) -> Any: "j": "Float32", "k": "bool", "l": "string[pyarrow]", + "m": "datetime64[s]", + "n": "datetime64[ms]", + "o": "datetime64[us]", + "p": "timedelta64[ms]", + "q": "timedelta64[us]", } ) return convert_to_standard_compliant_dataframe(df) @@ -471,6 +520,11 @@ def mixed_dataframe_1(library: str) -> Any: "j": pl.Float32, "k": pl.Boolean, "l": pl.Utf8, + "m": pl.Date, + "n": pl.Datetime("ms"), + "o": pl.Datetime("us"), + "p": pl.Duration("ms"), + "q": pl.Duration("us"), }, ) return convert_to_standard_compliant_dataframe(df)