diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 68ea8eaac3e..94b788aea47 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -430,6 +430,7 @@ jobs: if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/tests/interchange/dataframe_protocol/pandas/test_protocol.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' + - run: python -m pytest modin/tests/polars/test_dataframe.py - run: | python -m pip install lazy_import python -m pytest modin/tests/pandas/integrations/ diff --git a/environment-dev.yml b/environment-dev.yml index 3ea51032bde..049b3e39830 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -70,3 +70,4 @@ dependencies: - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.6.0 + - polars diff --git a/modin/polars/__init__.py b/modin/polars/__init__.py new file mode 100644 index 00000000000..3407698eb64 --- /dev/null +++ b/modin/polars/__init__.py @@ -0,0 +1,17 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from modin.polars.dataframe import DataFrame +from modin.polars.series import Series + +__all__ = ["DataFrame", "Series"] diff --git a/modin/polars/base.py b/modin/polars/base.py new file mode 100644 index 00000000000..010ee9e946c --- /dev/null +++ b/modin/polars/base.py @@ -0,0 +1,668 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Implement DataFrame/Series public API as polars does.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Sequence + +import polars + +from modin.core.storage_formats import BaseQueryCompiler + +if TYPE_CHECKING: + import numpy as np + + from modin.polars import DataFrame, Series + + +class BasePolarsDataset: + + _query_compiler: BaseQueryCompiler + + @property + def __constructor__(self): + """ + DataFrame constructor. 
+ + Returns: + Constructor of the DataFrame + """ + return type(self) + + def __eq__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.eq( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __ne__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.ne( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __add__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.add( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __sub__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.sub( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __mul__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.mul( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __truediv__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.truediv( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __floordiv__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.floordiv( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __mod__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.mod( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __pow__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.pow( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __and__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__and__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __or__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__or__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __xor__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.__xor__( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __lt__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.lt( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __le__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.le( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __gt__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.gt( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __ge__(self, other) -> "BasePolarsDataset": + return self.__constructor__( + _query_compiler=self._query_compiler.ge( + other._query_compiler if isinstance(other, BasePolarsDataset) else other + ) + ) + + def __invert__(self) -> "BasePolarsDataset": + return 
self.__constructor__(_query_compiler=self._query_compiler.invert()) + + def __neg__(self) -> "BasePolarsDataset": + return self.__constructor__(_query_compiler=self._query_compiler.negative()) + + def __abs__(self) -> "BasePolarsDataset": + return self.__constructor__(_query_compiler=self._query_compiler.abs()) + + def is_duplicated(self): + """ + Determine whether each row is a duplicate in the DataFrame. + + Returns: + DataFrame with True for each duplicate row, and False for unique rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.duplicated(keep=False) + ) + + def is_empty(self) -> bool: + """ + Determine whether the DataFrame is empty. + + Returns: + True if the DataFrame is empty, False otherwise + """ + return self.height == 0 + + def is_unique(self): + """ + Determine whether each row is unique in the DataFrame. + + Returns: + DataFrame with True for each unique row, and False for duplicate rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.duplicated(keep=False).invert() + ) + + def n_chunks(self, strategy: str = "first") -> int | list[int]: + raise NotImplementedError("not yet") + + def to_arrow(self): + """ + Convert the DataFrame to Arrow format. + + Returns: + Arrow representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_arrow() + + def to_jax(self, device=None): + """ + Convert the DataFrame to JAX format. + + Args: + device: The device to use. + + Returns: + JAX representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_jax( + device=device + ) + + def to_numpy( + self, + *, + writable: bool = False, + allow_copy: bool = True, + use_pyarrow: bool | None = None, + zero_copy_only: bool | None = None, + ) -> "np.ndarray": + """ + Convert the DataFrame to a NumPy representation. + + Args: + writable: Whether the NumPy array should be writable. + allow_copy: Whether to allow copying the data. + use_pyarrow: Whether to use PyArrow for conversion. + zero_copy_only: Whether to use zero-copy conversion only. + + Returns: + NumPy representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_numpy( + writable=writable, + allow_copy=allow_copy, + use_pyarrow=use_pyarrow, + zero_copy_only=zero_copy_only, + ) + + def to_torch(self): + """ + Convert the DataFrame to PyTorch format. + + Returns: + PyTorch representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).to_torch() + + def bottom_k( + self, + k: int, + *, + by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + maintain_order: bool | None = None, + ) -> "BasePolarsDataset": + raise NotImplementedError("not yet") + + def cast(self, dtypes, *, strict: bool = True) -> "BasePolarsDataset": + """ + Cast the DataFrame to the given dtypes. + + Args: + dtypes: Dtypes to cast the DataFrame to. + strict: Whether to enforce strict casting. + + Returns: + DataFrame with the new dtypes. + """ + # TODO: support strict + return self.__constructor__(_query_compiler=self._query_compiler.astype(dtypes)) + + def clone(self) -> "BasePolarsDataset": + """ + Clone the DataFrame. + + Returns: + Cloned DataFrame. + """ + return self.copy() + + def drop_nulls(self, subset=None): + """ + Drop the rows with null values. + + Args: + subset: Columns to consider for null values. + + Returns: + DataFrame with the rows with null values dropped. 
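+
+        Example (an illustrative sketch, assuming a modin.polars ``DataFrame``
+        with one null row):
+
+            >>> df = DataFrame({"a": [1.0, None, 3.0]})
+            >>> df.drop_nulls().height
+            2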
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.dropna(subset=subset, how="any")
+        )
+
+    def explode(self, columns: str, *more_columns: str) -> "BasePolarsDataset":
+        """
+        Explode the given columns to long format.
+
+        Args:
+            columns: Columns to explode.
+            more_columns: Additional columns to explode.
+
+        Returns:
+            DataFrame with the columns exploded.
+        """
+        if len(more_columns) > 0:
+            columns = [columns, *more_columns]
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.explode(columns)
+        )
+
+    def extend(self, other: "BasePolarsDataset") -> "BasePolarsDataset":
+        """
+        Extend the DataFrame with another DataFrame.
+
+        Args:
+            other: DataFrame to extend with.
+
+        Returns:
+            Extended DataFrame for convenience. DataFrame is modified in place.
+        """
+        self._query_compiler = self._query_compiler.concat(
+            axis=0, other=other._query_compiler
+        )
+        return self
+
+    def fill_nan(self, value):
+        """
+        Fill NaN values with the given value.
+
+        Args:
+            value: Value to fill NaN values with.
+
+        Returns:
+            DataFrame with NaN values filled.
+        """
+        # TODO: Handle null values differently than nan.
+        return self.__constructor__(_query_compiler=self._query_compiler.fillna(value))
+
+    def fill_null(
+        self,
+        value: Any | None = None,
+        strategy: str | None = None,
+        limit: int | None = None,
+        *,
+        matches_supertype: bool = True,
+    ) -> "BasePolarsDataset":
+        """
+        Fill null values with the given value or strategy.
+
+        Args:
+            value: Value to fill null values with.
+            strategy: Strategy to fill null values with.
+            limit: Maximum number of null values to fill.
+            matches_supertype: Whether the value matches the supertype.
+
+        Returns:
+            DataFrame with null values filled.
+        """
+        if strategy == "forward":
+            strategy = "ffill"
+        elif strategy == "backward":
+            strategy = "bfill"
+        elif strategy in ["min", "max", "mean"]:
+            value = getattr(self, strategy)()._query_compiler
+            strategy = None
+        elif strategy == "zero":
+            strategy = None
+            value = 0
+        elif strategy == "one":
+            strategy = None
+            value = 1
+        elif strategy is not None:
+            # Filling with a plain ``value`` arrives here with strategy=None and
+            # must not raise; only unrecognized strategies are an error.
+            raise ValueError(f"Unknown strategy: {strategy}")
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.fillna(
+                value=value, method=strategy, limit=limit
+            )
+        )
+
+    def filter(self, *predicates, **constraints: Any) -> "BasePolarsDataset":
+        if constraints:
+            raise NotImplementedError("Named constraints are not supported")
+        # Fold all passed predicates into a single boolean mask before indexing.
+        predicate, *rest = predicates
+        for p in rest:
+            predicate = predicate & p
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.getitem_array(
+                predicate._query_compiler
+            )
+        )
+
+    def gather_every(self, n: int, offset: int = 0) -> "BasePolarsDataset":
+        """
+        Gather every nth row of the DataFrame.
+
+        Args:
+            n: Step between gathered rows (every nth row is taken).
+            offset: Offset to start gathering from.
+
+        Returns:
+            DataFrame with every nth row gathered.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.getitem_row_array(
+                slice(offset, None, n)
+            )
+        )
+
+    def head(self, n: int = 5) -> "BasePolarsDataset":
+        """
+        Get the first n rows of the DataFrame.
+
+        Args:
+            n: Number of rows to get.
+
+        Returns:
+            DataFrame with the first n rows.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.getitem_row_array(slice(0, n))
+        )
+
+    def limit(self, n: int = 10) -> "BasePolarsDataset":
+        """
+        Limit the DataFrame to the first n rows.
+
+        Args:
+            n: Number of rows to limit to.
+
+        Returns:
+            DataFrame with the first n rows.
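+
+        Example (illustrative; ``limit(n)`` simply delegates to ``head(n)``):
+
+            >>> df = DataFrame({"a": list(range(100))})
+            >>> df.limit(3).height
+            3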
+ """ + return self.head(n) + + def interpolate(self) -> "BasePolarsDataset": + """ + Interpolate values the DataFrame using a linear method. + + Returns: + DataFrame with the interpolated values. + """ + return self.__constructor__(_query_compiler=self._query_compiler.interpolate()) + + def sample( + self, + n: int | "Series" | None = None, + *, + fraction: float | "Series" | None = None, + with_replacement: bool = False, + shuffle: bool = False, + seed: int | None = None, + ) -> "BasePolarsDataset": + """ + Sample the DataFrame. + + Args: + n: Number of rows to sample. + fraction: Fraction of rows to sample. + with_replacement: Whether to sample with replacement. + shuffle: Whether to shuffle the rows. + seed: Seed for the random number generator. + + Returns: + Sampled DataFrame. + """ + return self.__constructor__( + _query_compiler=self.to_pandas() + .sample(n=n, frac=fraction, replace=with_replacement, random_state=seed) + ._query_compiler + ) + + def shift(self, n: int = 1, *, fill_value=None) -> "DataFrame": + raise NotImplementedError("not yet") + + def shrink_to_fit(self) -> "DataFrame": + """ + Shrink the DataFrame to fit in memory. + + Returns: + A copy of the DataFrame. + """ + return self.copy() + + def slice(self, offset: int, length: int) -> "DataFrame": + """ + Slice the DataFrame. + + Args: + offset: Offset to start the slice from. + length: Length of the slice. + + Returns: + Sliced DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array( + slice(offset, offset + length) + ) + ) + + def sort( + self, + by, + *more_by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + multithreaded: bool = True, + maintain_order: bool = False, + ) -> "DataFrame": + """ + Sort the DataFrame. + + Args: + by: Column to sort by. + more_by: Additional columns to sort by. + descending: Whether to sort in descending order. + nulls_last: Whether to sort null values last. + multithreaded: Whether to use multiple threads. + maintain_order: Whether to maintain the order of the DataFrame. + + Returns: + Sorted DataFrame. + """ + # TODO: support expressions in by + if len(more_by) > 0: + by = [by, *more_by] + return self.__constructor__( + _query_compiler=self._query_compiler.sort_rows_by_column_values( + by=by, + reverse=descending, + nulls_first=None if nulls_last is None else not nulls_last, + ) + ) + + def tail(self, n: int = 5) -> "DataFrame": + """ + Get the last n rows of the DataFrame. + + Args: + n: Number of rows to get. + + Returns: + DataFrame with the last n rows. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_row_array(slice(-n, None)) + ) + + def to_dummies( + self, + columns: str | Sequence[str] | None = None, + *, + separator: str = "_", + drop_first: bool = False, + ) -> "DataFrame": + """ + Convert the columns to dummy variables. + + Args: + columns: Columns to convert to dummy variables. + separator: Separator for the dummy variables. + drop_first: Whether to drop the first dummy variable. + + Returns: + DataFrame with the columns converted to dummy variables. 
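+
+        Example (illustrative sketch; dummy column order follows pandas
+        ``get_dummies``):
+
+            >>> df = DataFrame({"color": ["red", "blue", "red"]})
+            >>> df.to_dummies("color").columns
+            ['color_blue', 'color_red']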
+ """ + if columns is not None: + if isinstance(columns, str): + columns = [columns] + else: + columns = self.columns + result = self.__constructor__( + _query_compiler=self._query_compiler.get_dummies(columns) + ) + if separator != "_": + result.columns = [ + c.replace(separator, "_") if separator in c else c + for c in result.columns + ] + if drop_first: + columns_to_drop = [ + next( + result_col + for result_col in result.columns + if result_col.startswith(c) + ) + for c in columns + ] + return result.drop(columns_to_drop) + else: + return result + + def top_k( + self, + k: int, + *, + by, + descending: bool | Sequence[bool] = False, + nulls_last: bool | Sequence[bool] | None = None, + maintain_order: bool | None = None, + ) -> "DataFrame": + raise NotImplementedError("not yet") + + def unique(self, subset=None, *, keep="any", maintain_order: bool = False): + """ + Get the unique values in each column. + + Args: + subset: Columns to consider for unique values. + keep: Strategy to keep unique values. + maintain_order: Whether to maintain the order of the unique values. + + Returns: + DataFrame with the unique values in each column. + """ + if keep == "none" or keep == "last": + # TODO: support keep="none" + raise NotImplementedError("not yet") + return self.__constructor__( + _query_compiler=self._query_compiler.unique(subset=subset) + ) + + def equals(self, other: "BasePolarsDataset", *, null_equal: bool = True) -> bool: + """ + Determine whether the DataFrame is equal to another DataFrame. + + Args: + other: DataFrame to compare with. + + Returns: + True if the DataFrames are equal, False otherwise. + """ + return ( + isinstance(other, type(self)) + and self._query_compiler.equals(other._query_compiler) + and ( + null_equal + or ( + not self.to_pandas().isna().any(axis=None) + and not other.to_pandas().isna().any(axis=None) + ) + ) + ) + + @property + def plot(self): + return polars.from_pandas(self._query_compiler.to_pandas()).plot + + def count(self): + """ + Get the number of non-null values in each column. + + Returns: + DataFrame with the counts. + """ + return self.__constructor__(_query_compiler=self._query_compiler.count(axis=0)) diff --git a/modin/polars/dataframe.py b/modin/polars/dataframe.py new file mode 100644 index 00000000000..d4408ff39f0 --- /dev/null +++ b/modin/polars/dataframe.py @@ -0,0 +1,1439 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
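+
+# A hypothetical usage sketch (illustrative only): the constructor accepts the
+# same inputs as ``polars.DataFrame`` and routes them through Modin's query
+# compiler, so frame methods execute distributed.
+#
+#     from modin.polars import DataFrame
+#
+#     df = DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+#     df.sum()  # returns a modin.polars DataFrame of column sums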
+ +"""Module houses ``DataFrame`` class, that is distributed version of ``polars.DataFrame``.""" + +from __future__ import annotations + +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Sequence + +import numpy as np +import pandas +import polars +from pandas.core.dtypes.common import is_list_like + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.pandas import DataFrame as ModinPandasDataFrame +from modin.pandas import Series as ModinPandasSeries +from modin.pandas.io import from_pandas +from modin.polars.base import BasePolarsDataset + +if TYPE_CHECKING: + from modin.polars import Series + from modin.polars.groupby import GroupBy + from modin.polars.lazyframe import LazyFrame + + +class DataFrame(BasePolarsDataset): + + def __init__( + self, + data=None, + schema=None, + *, + schema_overrides=None, + strict=True, + orient=None, + infer_schema_length=100, + nan_to_null=False, + _query_compiler=None, + ) -> None: + """ + Constructor for DataFrame object. + + Args: + data: Data to be converted to DataFrame. + schema: Schema of the data. + schema_overrides: Schema overrides. + strict: Whether to enforce strict schema. + orient: Orientation of the data. + infer_schema_length: Length of the data to infer schema. + nan_to_null: Whether to convert NaNs to nulls. + _query_compiler: Query compiler to use. + """ + if _query_compiler is None: + if isinstance(data, (ModinPandasDataFrame, ModinPandasSeries)): + self._query_compiler: BaseQueryCompiler = data._query_compiler.copy() + else: + self._query_compiler: BaseQueryCompiler = from_pandas( + polars.DataFrame( + data=data, + schema=schema, + schema_overrides=schema_overrides, + strict=strict, + orient=orient, + infer_schema_length=infer_schema_length, + nan_to_null=nan_to_null, + ).to_pandas() + )._query_compiler + else: + self._query_compiler: BaseQueryCompiler = _query_compiler + + def __getitem__(self, item): + """ + Get item from DataFrame. + + Args: + item: Column to get. + + Returns: + Series or DataFrame with the column. + """ + if is_list_like(item): + missing = [i for i in item if i not in self.columns] + if len(missing) > 0: + raise polars.exceptions.ColumnNotFoundError(missing[0]) + return self.__constructor__( + _query_compiler=self._query_compiler.getitem_array(item) + ) + else: + if item not in self.columns: + raise polars.exceptions.ColumnNotFoundError(item) + from .series import Series + + return Series(_query_compiler=self._query_compiler.getitem_array([item])) + + def _to_polars(self) -> polars.DataFrame: + """ + Convert the DataFrame to Polars format. + + Returns: + Polars representation of the DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()) + + def _get_columns(self): + """ + Get columns of the DataFrame. + + Returns: + List of columns. + """ + return list(self._query_compiler.columns) + + def _set_columns(self, new_columns): + """ + Set columns of the DataFrame. + + Args: + new_columns: New columns to set. 
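+
+        Example (illustrative; ``columns`` is exposed as a settable property):
+
+            >>> df = DataFrame({"a": [1], "b": [2]})
+            >>> df.columns = ["x", "y"]
+            >>> df.columns
+            ['x', 'y']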
+ """ + new_query_compiler = self._query_compiler.copy() + new_query_compiler.columns = new_columns + self._query_compiler = new_query_compiler + + columns = property(_get_columns, _set_columns) + + _sorted_columns_cache = None + + def _get_sorted_columns(self): + if self._sorted_columns_cache is None: + self._sorted_columns_cache = [False] * len(self.columns) + return self._sorted_columns_cache + + def _set_sorted_columns(self, value): + self._sorted_columns_cache = value + + _sorted_columns = property(_get_sorted_columns, _set_sorted_columns) + + @property + def dtypes(self): + """ + Get dtypes of the DataFrame. + + Returns: + List of dtypes. + """ + return polars.from_pandas( + pandas.DataFrame(columns=self.columns).astype(self._query_compiler.dtypes) + ).dtypes + + @property + def flags(self): + """ + Get flags of the DataFrame. + + Returns: + List of flags. + """ + # TODO: Add flags support + return [] + + @property + def height(self): + """ + Get height of the DataFrame. + + Returns: + Number of rows in the DataFrame. + """ + return len(self._query_compiler.index) + + @property + def schema(self): + """ + Get schema of the DataFrame. + + Returns: + OrderedDict of column names and dtypes. + """ + return OrderedDict(zip(self.columns, self.dtypes, strict=True)) + + @property + def shape(self): + """ + Get shape of the DataFrame. + + Returns: + Tuple of (height, width + """ + return self.height, self.width + + @property + def width(self): + """ + Get width of the DataFrame. + + Returns: + Number of columns in the DataFrame. + """ + return len(self.columns) + + def __repr__(self): + """ + Get string representation of the DataFrame. + + Returns: + String representation of the DataFrame. + """ + return repr(polars.from_pandas(self._query_compiler.to_pandas())) + + def max(self, axis=None): + """ + Get the maximum value in each column. + + Args: + axis: Axis to get the maximum value on. + + Returns: + DataFrame with the maximum values. + """ + if axis is None or axis == 0: + return self.__constructor__( + _query_compiler=self._query_compiler.max(axis=0) + ) + else: + return self.max_horizontal() + + def max_horizontal(self): + """ + Get the maximum value in each row. + + Returns: + DataFrame with the maximum values. + """ + return self.__constructor__(_query_compiler=self._query_compiler.max(axis=1)) + + def _convert_non_numeric_to_null(self): + """ + Convert non-numeric columns to null. + + Returns: + DataFrame with non-numeric columns converted to null. + """ + non_numeric_cols = [ + c + for c, t in zip(self.columns, self.dtypes, strict=True) + if not t.is_numeric() + ] + if len(non_numeric_cols) > 0: + return self.__constructor__( + _query_compiler=self._query_compiler.write_items( + slice(None), + [self.columns.index(c) for c in non_numeric_cols], + pandas.NA, + need_columns_reindex=False, + ).astype({c: self._query_compiler.dtypes[c] for c in non_numeric_cols}) + ) + return self.copy() + + def mean(self, *, axis=None, null_strategy="ignore"): + """ + Get the mean of each column. + + Args: + axis: Axis to get the mean on. + null_strategy: Strategy to handle null values. + + Returns: + DataFrame with the mean of each column or row. 
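+
+        Example (illustrative sketch):
+
+            >>> df = DataFrame({"a": [1.0, 3.0], "b": [2.0, 4.0]})
+            >>> df.mean()        # column means: a=2.0, b=3.0
+            >>> df.mean(axis=1)  # row means via mean_horizontal: 1.5, 3.5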
+        """
+        # TODO: this converts non numeric columns to numeric
+        obj = self._convert_non_numeric_to_null()
+        if axis is None or axis == 0:
+            return self.__constructor__(
+                _query_compiler=obj._query_compiler.mean(
+                    axis=0,
+                    skipna=null_strategy == "ignore",
+                )
+            )
+        else:
+            return obj.mean_horizontal(ignore_nulls=null_strategy == "ignore")
+
+    def median(self) -> "DataFrame":
+        """
+        Get the median of each column.
+
+        Returns:
+            DataFrame with the median of each column.
+        """
+        return self.__constructor__(
+            _query_compiler=self._convert_non_numeric_to_null()._query_compiler.median(
+                0
+            )
+        )
+
+    def mean_horizontal(self, *, ignore_nulls: bool = True):
+        """
+        Get the mean of each row.
+
+        Args:
+            ignore_nulls: Whether to ignore null values.
+
+        Returns:
+            DataFrame with the mean of each row.
+        """
+        obj = self._convert_non_numeric_to_null()
+        return self.__constructor__(
+            _query_compiler=obj._query_compiler.mean(axis=1, skipna=ignore_nulls)
+        )
+
+    def min(self, axis=None):
+        """
+        Get the minimum value in each column.
+
+        Args:
+            axis: Axis to get the minimum value on.
+
+        Returns:
+            DataFrame with the minimum values of each row or column.
+        """
+        if axis is None or axis == 0:
+            return self.__constructor__(
+                _query_compiler=self._query_compiler.min(axis=0)
+            )
+        else:
+            return self.min_horizontal()
+
+    def min_horizontal(self):
+        """
+        Get the minimum value in each row.
+
+        Returns:
+            DataFrame with the minimum values of each row.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.min(axis=1))
+
+    def product(self):
+        """
+        Get the product of each column.
+
+        Returns:
+            DataFrame with the product of each column.
+        """
+        obj = self._convert_non_numeric_to_null()
+        return self.__constructor__(_query_compiler=obj._query_compiler.prod(axis=0))
+
+    def quantile(self, quantile: float, interpolation="nearest"):
+        """
+        Get the quantile of each column.
+
+        Args:
+            quantile: Quantile to get.
+            interpolation: Interpolation method.
+
+        Returns:
+            DataFrame with the quantile of each column.
+        """
+        obj = self._convert_non_numeric_to_null()
+        # TODO: interpolation support
+        return self.__constructor__(
+            _query_compiler=obj._query_compiler.quantile_for_single_value(quantile)
+        )
+
+    def std(self, ddof: int = 1):
+        """
+        Get the standard deviation of each column.
+
+        Args:
+            ddof: Delta degrees of freedom.
+
+        Returns:
+            DataFrame with the standard deviation of each column.
+        """
+        obj = self._convert_non_numeric_to_null()
+        return self.__constructor__(_query_compiler=obj._query_compiler.std(ddof=ddof))
+
+    def sum(self, axis: int | None = None, null_strategy="ignore"):
+        """
+        Get the sum of each column.
+
+        Args:
+            axis: Axis to get the sum on.
+            null_strategy: Strategy to handle null values.
+
+        Returns:
+            DataFrame with the sum of each column or row.
+        """
+        obj = self._convert_non_numeric_to_null()
+        if axis is None or axis == 0:
+            return self.__constructor__(
+                _query_compiler=obj._query_compiler.sum(
+                    axis=0,
+                    skipna=null_strategy == "ignore",
+                )
+            )
+        else:
+            return obj.sum_horizontal(ignore_nulls=null_strategy == "ignore")
+
+    def sum_horizontal(self, *, ignore_nulls: bool = True):
+        """
+        Get the sum of each row.
+
+        Args:
+            ignore_nulls: Whether to ignore null values.
+
+        Returns:
+            DataFrame with the sum of each row.
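+
+        Example (illustrative; non-numeric columns are nulled out first, see
+        the TODO in the implementation below):
+
+            >>> df = DataFrame({"a": [1, 2], "b": [10, 20], "c": ["x", "y"]})
+            >>> df.sum_horizontal()  # rows sum to 11 and 22; "c" is ignored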
+        """
+        # TODO: if there are strings in the row, polars will append numeric values
+        # this behavior may not be intended so doing this instead (for now)
+        obj = self._convert_non_numeric_to_null()
+        return self.__constructor__(
+            _query_compiler=obj._query_compiler.sum(axis=1, skipna=ignore_nulls)
+        )
+
+    def var(self, ddof: int = 1):
+        """
+        Get the variance of each column.
+
+        Args:
+            ddof: Delta degrees of freedom.
+
+        Returns:
+            DataFrame with the variance of each column.
+        """
+        obj = self._convert_non_numeric_to_null()
+        return self.__constructor__(_query_compiler=obj._query_compiler.var(ddof=ddof))
+
+    def approx_n_unique(self):
+        """
+        Get the approximate number of unique values in each column.
+
+        Returns:
+            DataFrame with the approximate number of unique values in each column.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.nunique())
+
+    def describe(self, percentiles: Sequence[float] | float = (0.25, 0.5, 0.75)):
+        """
+        Get the descriptive statistics of each column.
+
+        Args:
+            percentiles: Percentiles to get.
+
+        Returns:
+            DataFrame with the descriptive statistics of each column.
+        """
+        return self.__constructor__(
+            self.__constructor__(
+                _query_compiler=self._query_compiler.describe(
+                    percentiles=np.array(percentiles)
+                ).astype(
+                    {
+                        k: str
+                        for k, v in zip(self.columns, self.dtypes, strict=True)
+                        if v == polars.String
+                    }
+                )
+            )
+            .to_pandas()
+            .loc[
+                [
+                    "count",
+                    # "null_count", TODO: support null_count in describe
+                    "mean",
+                    "std",
+                    "min",
+                    "25%",
+                    "50%",
+                    "75%",
+                    "max",
+                ]
+            ]
+            .reset_index()
+            .rename(columns={"index": "statistic"})
+        )
+
+    def estimated_size(self, unit="b"):
+        """
+        Get the estimated amount of memory used by the DataFrame.
+
+        Args:
+            unit: Unit of the memory size.
+
+        Returns:
+            DataFrame with the estimated memory usage.
+        """
+        # TODO: support ``unit``; only bytes are reported for now.
+        return self.__constructor__(_query_compiler=self._query_compiler.memory_usage())
+
+    def glimpse(
+        self,
+        *,
+        max_items_per_column: int = 10,
+        max_colname_length: int = 50,
+        return_as_string: bool = False,
+    ) -> str | None:
+        raise NotImplementedError("not yet")
+
+    def n_unique(self, subset=None) -> int:
+        """
+        Get the number of unique rows in the DataFrame.
+
+        Args:
+            subset: Columns to consider when counting unique rows.
+
+        Returns:
+            Number of unique rows.
+        """
+        if subset is not None:
+            raise NotImplementedError("not yet")
+        return (
+            self.is_unique()._query_compiler.sum(axis=0).to_pandas().squeeze(axis=None)
+        )
+
+    def null_count(self) -> "DataFrame":
+        """
+        Get the number of null values in each column.
+
+        Returns:
+            DataFrame with the number of null values in each column.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.isna().sum(axis=0)
+        )
+
+    def to_pandas(self):
+        """
+        Convert the DataFrame to Pandas format.
+
+        Returns:
+            modin.pandas representation of the DataFrame.
+        """
+        return ModinPandasDataFrame(query_compiler=self._query_compiler.copy())
+
+    def group_by(
+        self,
+        *by,
+        maintain_order: bool = False,
+        **named_by,
+    ) -> "GroupBy":
+        """
+        Group the DataFrame by the given columns.
+
+        Args:
+            by: Columns to group by.
+            maintain_order: Whether to maintain the order of the groups.
+            named_by: Named columns to group by.
+
+        Returns:
+            GroupBy object.
+        """
+        from modin.polars.groupby import GroupBy
+
+        return GroupBy(self, *by, maintain_order=maintain_order, **named_by)
+
+    def drop(self, *columns, strict: bool = True) -> "DataFrame":
+        """
+        Drop the given columns.
+
+        Args:
+            columns: Columns to drop.
+ strict: Whether to raise an error if a column is not found. + + Returns: + DataFrame with the columns dropped. + """ + if strict: + for c in columns: + if c not in self.columns: + raise KeyError(c) + columns = list(columns) if not isinstance(columns[0], list) else columns[0] + return self.__constructor__(_query_compiler=self._query_compiler.drop(columns)) + + def drop_in_place(self, name: str) -> "DataFrame": + """ + Drop the given column in place and return the dropped column. + + Args: + name: Column to drop. + + Returns: + The column that was dropped from the DataFrame. + """ + col_to_return = self[name] + self._query_compiler = self._query_compiler.drop([name]) + return col_to_return + + def get_column(self, name: str) -> "Series": + """ + Get the column by name. + + Args: + name: Name of the column to get. + + Returns: + Series with the column. + """ + return self[name] + + def get_column_index(self, name: str) -> int: + """ + Find the index of the column by name. + + Args: + name: Name of the column to find. + + Returns: + Index of the column. + """ + return self.columns.index(name) + + def get_columns(self) -> list["Series"]: + """ + Get the columns of the DataFrame. + + Returns: + List of Series with the columns. + """ + return [self[name] for name in self.columns] + + def group_by_dynamic( + self, + index_column, + *, + every, + period, + offset, + truncate, + include_boundaries, + closed, + label, + group_by, + start_by, + check_sorted, + ): + raise NotImplementedError("not yet") + + def hstack(self, columns, *, inplace: bool = False) -> "DataFrame": + """ + Stack the given columns horizontally. + + Args: + columns: Columns to stack. + inplace: Whether to stack the columns in place. + + Returns: + DataFrame with the columns stacked horizontally. + """ + if isinstance(columns, DataFrame): + columns = columns.get_columns() + result_query_compiler = self._query_compiler.concat( + axis=1, other=[c._query_compiler for c in columns] + ) + if inplace: + self._query_compiler = result_query_compiler + return self + return self.__constructor__(_query_compiler=result_query_compiler) + + def insert_column(self, index: int, column: "Series") -> "DataFrame": + """ + Insert the given column at the given index. + + Args: + index: Index to insert the column at. + column: Column to insert. + name: Name of the column to insert. + + Returns: + DataFrame with the column inserted. + """ + return self.__constructor__( + self._query_compiler.insert(index, column.name, column._query_compiler) + ) + + def item(self, row: int | None = None, column: str | int | None = None) -> Any: + """ + Get the value at the given row and column. + + Args: + row: Row to get the value from. + column: Column to get the value from. + + Returns: + Value at the given row and column. + """ + if row is None: + row = 0 + if column is None: + column = 0 + if isinstance(column, str): + column = self.columns.index(column) + return ( + self._query_compiler.take_2d_labels(row, column) + .to_pandas() + .squeeze(axis=None) + ) + + def iter_columns(self) -> Iterator["Series"]: + """ + Iterate over the columns of the DataFrame. + + Returns: + Iterator over the columns. + """ + return iter(self.get_columns()) + + def iter_rows( + self, + *, + named: bool = False, + buffer_size: int = 512, + ) -> Iterator[tuple[Any]] | Iterator[dict[str, Any]]: + """ + Iterate over the rows of the DataFrame. + + Returns: + Iterator over the rows. 
+ """ + raise NotImplementedError("not yet") + + def iter_slices( + self, + n_rows: int = 10000, + ) -> Iterator["DataFrame"]: + """ + Iterate over the slices of the DataFrame. + + Args: + n_rows: Number of rows in each slice. + + Returns: + Iterator over the slices. + """ + raise NotImplementedError("not yet") + + def join( + self, + other: "DataFrame", + on: str | list[str] | None = None, + how: str = "inner", + *, + left_on: str | list[str] | None = None, + right_on: str | list[str] | None = None, + suffix: str = "_right", + validate="m:m", + join_nulls: bool = False, + coalesce: bool | None = None, + ) -> "DataFrame": + """ + Join the DataFrame with another DataFrame. + + Args: + other: DataFrame to join with. + on: Column to join on. + how: How to join the DataFrames. + + Returns: + Joined DataFrame. + """ + if how == "full": + how = "outer" + elif how == "cross": + raise NotImplementedError("not yet") + elif how == "semi": + how = "right" + elif how == "anti": + raise NotImplementedError("not yet") + return self.__constructor__( + _query_compiler=self._query_compiler.merge( + other._query_compiler, + on=on, + how=how, + suffixes=("", suffix), + left_on=left_on, + right_on=right_on, + ) + ) + + def join_asof( + self, + other: "DataFrame", + *, + left_on: str | None = None, + right_on: str | None = None, + on: str | None = None, + by_left: str | Sequence[str] | None = None, + by_right: str | Sequence[str] | None = None, + by: str | Sequence[str] | None = None, + strategy: str = "backward", + suffix: str = "_right", + tolerance: str, + ) -> "DataFrame": + """ + Join the DataFrame with another DataFrame using asof logic. + + Args: + other: DataFrame to join with. + left_on: Column to join on in the left DataFrame. + right_on: Column to join on in the right DataFrame. + on: Column to join on in both DataFrames. + by_left: Columns to join on in the left DataFrame. + by_right: Columns to join on in the right DataFrame. + by: Columns to join on in both DataFrames. + strategy: Strategy to use for the join. + suffix: Suffix to add to the columns. + tolerance: Tolerance for the join. + + Returns: + Joined DataFrame. + """ + if on is not None and left_on is None and right_on is None: + left_on = right_on = on + if by is not None and by_left is None and by_right is None: + by_left = by_right = by + return self.__constructor__( + _query_compiler=self._query_compiler.merge_asof( + other._query_compiler, + left_on=left_on, + right_on=right_on, + left_by=by_left, + right_by=by_right, + direction=strategy, + suffixes=("", suffix), + tolerance=tolerance, + ) + ) + + def melt( + self, + id_vars=None, + value_vars=None, + variable_name: str | None = None, + value_name: str | None = None, + ) -> "DataFrame": + """ + Melt the DataFrame. + + Args: + id_vars: Columns to keep. + value_vars: Columns to melt. + variable_name: Name of the variable column. + value_name: Name of the value column. + + Returns: + Melted DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.melt( + id_vars=id_vars, + value_vars=value_vars, + var_name=variable_name, + value_name=value_name, + ) + ) + + def merge_sorted(self, other: "DataFrame", on: str | list[str]) -> "DataFrame": + # TODO: support natural join + sort + raise NotImplementedError("not yet") + + def partition_by( + self, + by, + *more_by, + maintain_order: bool = True, + include_key: bool = True, + as_dict: bool = False, + ) -> list["DataFrame"] | dict[Any, "DataFrame"]: + """ + Partition the DataFrame by the given columns. 
+
+        Args:
+            by: Columns to partition by.
+            more_by: Additional columns to partition by.
+            maintain_order: Whether to maintain the order of the partitions.
+            include_key: Whether to include the partition key.
+            as_dict: Whether to return the partitions as a dictionary.
+
+        Returns:
+            List of DataFrames or dictionary of DataFrames.
+        """
+        if isinstance(by, str):
+            by = [by, *more_by]
+        elif isinstance(by, list):
+            by = [*by, *more_by]
+        # Iterating a pandas groupby yields (key, frame) pairs.
+        # TODO: drop the key columns from each partition when include_key=False.
+        if as_dict:
+            return {
+                k: self.__constructor__(g)
+                for k, g in self.to_pandas().groupby(by, as_index=not include_key)
+            }
+        else:
+            return [
+                self.__constructor__(g)
+                for _, g in self.to_pandas().groupby(by, as_index=not include_key)
+            ]
+
+    def pipe(self, function, *args, **kwargs) -> Any:
+        return function(self, *args, **kwargs)
+
+    def pivot(
+        self,
+        *,
+        values,
+        index,
+        columns,
+        aggregate_function=None,
+        maintain_order: bool = True,
+        sort_columns: bool = False,
+        separator: str = "_",
+    ) -> "DataFrame":
+        """
+        Pivot the DataFrame.
+
+        Args:
+            values: Values to pivot.
+            index: Index columns.
+            columns: Columns to pivot.
+            aggregate_function: Function to aggregate the values.
+            maintain_order: Whether to maintain the order of the pivot.
+            sort_columns: Whether to sort the columns.
+            separator: Separator for the columns.
+
+        Returns:
+            Pivoted DataFrame.
+        """
+        # TODO: handle maintain_order, sort_columns, separator
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.pivot(
+                values=values,
+                index=index,
+                columns=columns,
+                agg=aggregate_function,
+            )
+        )
+
+    def rechunk(self) -> "DataFrame":
+        """
+        Rechunk the DataFrame.
+
+        Returns:
+            A copy of the DataFrame; partitioning is left to the Modin engine.
+        """
+        return self.copy()
+
+    def rename(self, mapping: dict[str, str] | callable) -> "DataFrame":
+        """
+        Rename the columns of the DataFrame.
+
+        Args:
+            mapping: Mapping of old names to new names.
+
+        Returns:
+            DataFrame with the columns renamed.
+        """
+        if callable(mapping):
+            mapping = {c: mapping(c) for c in self.columns}
+        # TODO: add a query compiler method for `rename`
+        new_columns = [mapping.get(c, c) for c in self.columns]
+        new_obj = self.copy()
+        new_obj.columns = new_columns
+        return new_obj
+
+    def replace_column(self, index: int, column: "Series") -> "DataFrame":
+        """
+        Replace the column at the given index with the new column.
+
+        Args:
+            index: Index of the column to replace.
+            column: New column to replace with.
+
+        Returns:
+            DataFrame with the column replaced.
+        """
+        self._query_compiler = self._query_compiler.drop([self.columns[index]]).insert(
+            index,
+            column.name,
+            column._query_compiler,
+        )
+        return self
+
+    def reverse(self) -> "DataFrame":
+        """
+        Reverse the DataFrame.
+
+        Returns:
+            Reversed DataFrame.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.getitem_row_array(
+                slice(None, None, -1)
+            )
+        )
+
+    def rolling(self, index_column, *, period, offset, closed, group_by, check_sorted):
+        raise NotImplementedError("not yet")
+
+    def row(
+        self, index: int | None = None, *, by_predicate=None, named: bool = False
+    ) -> tuple[Any] | dict[str, Any]:
+        """
+        Get the row at the given index.
+
+        Args:
+            index: Index of the row to get.
+            by_predicate: Predicate to get the row by.
+            named: Whether to return the row as a dictionary.
+
+        Returns:
+            Row at the given index.
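+
+        Example (illustrative sketch):
+
+            >>> df = DataFrame({"a": [1, 2], "b": ["x", "y"]})
+            >>> df.row(1)
+            (2, 'y')
+            >>> df.row(1, named=True)
+            {'a': 2, 'b': 'y'}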
+ """ + if index is not None: + if named: + return dict(self.to_pandas().iloc[index]) + else: + return tuple(self.to_pandas().iloc[index]) + else: + # TODO: support expressions + raise NotImplementedError("not yet") + + def rows(self, *, named: bool = False) -> list[tuple[Any]] | list[dict[str, Any]]: + raise NotImplementedError("not yet") + + def rows_by_key( + self, + key: Any, + *, + named: bool = False, + include_key: bool = False, + unique: bool = False, + ) -> dict[Any, Iterable[Any]]: + raise NotImplementedError("not yet") + + def select(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def select_seq(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def set_sorted( + self, column: str | Iterable[str], *more_columns: str, descending: bool = False + ) -> "DataFrame": + """ + Set the columns to be sorted. + + Args: + column: Column to sort by. + more_columns: Additional columns to sort by. + descending: Whether to sort in descending order. + + Returns: + DataFrame with the columns sorted. + """ + if len(more_columns) > 0: + if isinstance(column, Iterable): + column = [*column, *more_columns] + else: + column = [column, *more_columns] + if isinstance(column, str): + column = [column] + new_sorted_columns = [c in column for c in self.columns] + obj = self.copy() + obj._sorted_columns = new_sorted_columns + return obj + + def sql(self, query: str, *, table_name: str = "self") -> "DataFrame": + raise NotImplementedError("not yet") + + def to_series(self, index: int = 0) -> "Series": + """ + Convert the DataFrame at index provided to a Series. + + Args: + index: Index of the column to convert to a Series. + + Returns: + Series representation of the DataFrame at index provided. + """ + return self[self.columns[index]] + + def transpose( + self, + *, + include_header: bool = False, + header_name: str = "column", + column_names: str | Sequence[str] | None = None, + ) -> "DataFrame": + """ + Transpose the DataFrame. + + Args: + include_header: Whether to include a header. + header_name: Name of the header. + column_names: Names of the columns. + + Returns: + Transposed DataFrame. + """ + result = self.__constructor__(_query_compiler=self._query_compiler.transpose()) + if column_names is not None: + result.columns = column_names + elif include_header: + result.columns = [f"{header_name}_{i}" for i in range(result.width)] + return result + + def unnest(self, columns, *more_columns) -> "DataFrame": + """ + Unnest the given columns. + + Args: + columns: Columns to unnest. + more_columns: Additional columns to unnest. + + Returns: + DataFrame with the columns unnested. + """ + raise NotImplementedError("not yet") + + def unstack( + self, + step: int, + how: str = "vertical", + columns=None, + fill_values: list[Any] | None = None, + ): + """ + Unstack the DataFrame. + + Args: + step: Step to unstack by. + how: How to unstack the DataFrame. + columns: Columns to unstack. + fill_values: Values to fill the unstacked DataFrame with. + + Returns: + Unstacked DataFrame. + """ + raise NotImplementedError("not yet") + + def update( + self, + other: "DataFrame", + on: str | Sequence[str] | None = None, + how: Literal["left", "inner", "full"] = "left", + *, + left_on: str | Sequence[str] | None = None, + right_on: str | Sequence[str] | None = None, + include_nulls: bool = False, + ) -> "DataFrame": + """ + Update the DataFrame with another DataFrame. 
+ + Args: + other: DataFrame to update with. + on: Column to update on. + how: How to update the DataFrame. + + Returns: + Updated DataFrame. + """ + raise NotImplementedError("not yet") + + def upsample( + self, + time_column: str, + *, + every: str, + offset: str | None = None, + group_by: str | Sequence[str] | None = None, + maintain_order: bool = False, + ) -> "DataFrame": + raise NotImplementedError("not yet") + + def vstack(self, other: "DataFrame", *, in_place: bool = False) -> "DataFrame": + """ + Stack the given DataFrame vertically. + + Args: + other: DataFrame to stack. + in_place: Whether to stack the DataFrames in place. + + Returns: + Stacked DataFrame. + """ + if in_place: + self._query_compiler = self._query_compiler.concat( + axis=0, other=other._query_compiler + ) + return self + else: + return self.__constructor__( + _query_compiler=self._query_compiler.concat( + axis=0, other=other._query_compiler + ) + ) + + def with_columns(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def with_columns_seq(self, *exprs, **named_exprs) -> "DataFrame": + # TODO: support expressions + raise NotImplementedError("not yet") + + def with_row_index(self, name: str = "index", offset: int = 0) -> "DataFrame": + """ + Add a row index to the DataFrame. + + Args: + name: Name of the row index. + offset: Offset for the row index. + + Returns: + DataFrame with the row index added. + """ + if offset != 0: + obj = self.copy() + obj.index = obj.index + offset + result = self.__constructor__( + _query_compiler=self._query_compiler.reset_index(drop=False) + ) + result.columns = [name, *self.columns] + return result + + with_row_count = with_row_index + + def map_rows( + self, function: callable, return_dtype=None, *, inference_size: int = 256 + ) -> "DataFrame": + """ + Apply the given function to the DataFrame. + + Args: + function: Function to apply. + return_dtype: Return type of the function. + inference_size: Size of the inference. + + Returns: + DataFrame with the function applied. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.apply(function, axis=1) + ) + + def corr(self, **kwargs: Any) -> "DataFrame": + """ + Compute the correlation of the DataFrame. + + Returns: + DataFrame with the correlation. + """ + return self.__constructor__(_query_compiler=self._query_compiler.corr(**kwargs)) + + def lazy(self) -> "LazyFrame": + """ + Convert the DataFrame to a lazy DataFrame. + + Returns: + Lazy DataFrame. + """ + raise NotImplementedError("not yet") + + @classmethod + def deserialize(cls, source) -> "DataFrame": + """ + Deserialize the DataFrame. + + Args: + source: Source to deserialize. + + Returns: + Deserialized DataFrame. + """ + return cls(polars.DataFrame.deserialize(source)) + + def serialize(self, file=None) -> str | None: + """ + Serialize the DataFrame. + + Args: + file: File to serialize to. + + Returns: + Serialized DataFrame. + """ + return polars.from_pandas(self._query_compiler.to_pandas()).serialize(file) + + @property + def style(self): + """ + Create a Great Table for styling. + + Returns: + GreatTable object. + """ + return self._to_polars().style + + def to_dict( + self, *, as_series: bool = True + ) -> dict[str, "Series"] | dict[str, list[Any]]: + """ + Convert the DataFrame to a dictionary representation. + + Args: + as_series: Whether to convert the columns to Series. + + Returns: + Dictionary representation of the DataFrame. 
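+
+        Example (illustrative; with the default ``as_series=True`` the values
+        are modin.polars ``Series`` objects):
+
+            >>> df = DataFrame({"a": [1, 2]})
+            >>> df.to_dict(as_series=False)
+            {'a': [1, 2]}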
+ """ + if as_series: + return {name: self[name] for name in self.columns} + else: + return polars.from_pandas(self._query_compiler.to_pandas()).to_dict( + as_series=as_series + ) + + def to_dicts(self) -> list[dict[str, Any]]: + """ + Convert the DataFrame to a list of dictionaries. + + Returns: + List of dictionaries. + """ + return self._to_polars().to_dicts() + + def to_init_repr(self, n: int = 1000) -> str: + """ + Get the string representation of the DataFrame for initialization. + + Returns: + String representation of the DataFrame for initialization. + """ + return self._to_polars().to_init_repr(n) + + def to_struct(self, name: str = "") -> "Series": + """ + Convert the DataFrame to a struct. + + Args: + name: Name of the struct. + + Returns: + Series representation of the DataFrame as a struct. + """ + raise NotImplementedError("not yet") + + def unpivot( + self, + on, + *, + index, + variable_name: str | None = None, + value_name: str | None = None, + ) -> "DataFrame": + """ + Unpivot a DataFrame from wide to long format. + + Args: + on: Columns to unpivot. + index: Columns to keep. + variable_name: Name of the variable column. + value_name: Name of the value column. + + Returns: + Unpivoted DataFrame. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.melt( + on=on, + index=index, + var_name=variable_name, + value_name=value_name, + ) + ) + + write_avro = write_clipboard = write_csv = write_database = write_delta = ( + write_excel + ) = write_ipc = write_ipc_stream = write_json = write_ndjson = write_parquet = ( + write_parquet_partitioned + ) = lambda *args, **kwargs: (_ for _ in ()).throw(NotImplementedError("not yet")) + + def clear(self, n: int = 0) -> "DataFrame": + """ + Create an empty (n=0) or null filled (n>0) DataFrame. + + Args: + n: Number of rows to create. + + Returns: + Empty or null filled DataFrame. + """ + return self.__constructor__(polars.DataFrame(schema=self.schema).clear(n=n)) + + def collect_schema(self) -> dict[str, str]: + """ + Collect the schema of the DataFrame. + + Returns: + Dictionary of the schema. + """ + return self.schema + + def fold(self, operation: callable) -> "Series": + """ + Fold the DataFrame. + + Args: + operation: Operation to fold the DataFrame with. + + Returns: + Series with the folded DataFrame. + """ + raise NotImplementedError("not yet") + + def hash_rows( + self, + seed: int = 0, + seed_1: int | None = None, + seed_2: int | None = None, + seed_3: int | None = None, + ) -> "Series": + raise NotImplementedError("not yet") diff --git a/modin/polars/groupby.py b/modin/polars/groupby.py new file mode 100644 index 00000000000..ec6305a4b2b --- /dev/null +++ b/modin/polars/groupby.py @@ -0,0 +1,247 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
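+
+# A hypothetical usage sketch (illustrative only): aggregations dispatch to the
+# ``groupby_*`` methods of Modin's query compiler.
+#
+#     from modin.polars import DataFrame
+#
+#     df = DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
+#     df.group_by("key").sum()  # one row per key with "val" summed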
+
+"""Implement GroupBy public API as polars does."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from modin.polars import DataFrame
+
+
+class GroupBy:
+
+    def __init__(
+        self,
+        df: "DataFrame",
+        *by,
+        maintain_order: bool = False,
+        **named_by,
+    ) -> None:
+        self.df = df
+        if len(by) == 1:
+            self.by = by[0]
+        else:
+            if all(isinstance(b, str) and b in self.df.columns for b in by):
+                self.by = self.df[list(by)]._query_compiler
+            elif all(isinstance(b, type(self.df._query_compiler)) for b in by):
+                self.by = by
+            else:
+                raise NotImplementedError("not yet")
+        self.named_by = named_by
+        self.maintain_order = maintain_order
+
+    def agg(self, *aggs, **named_aggs):
+        raise NotImplementedError("not yet")
+
+    def all(self):
+        raise NotImplementedError("not yet")
+
+    def map_groups(self, function) -> "DataFrame":
+        raise NotImplementedError("not yet")
+
+    apply = map_groups
+
+    def count(self):
+        return self.len(name="count")
+
+    def first(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_first(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def head(self, n: int = 5):
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_head(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(n=n),
+                drop=False,
+            )
+        )
+
+    def last(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_last(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def len(self, name: str | None = None) -> "DataFrame":
+        if name is None:
+            name = "len"
+        result = self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_size(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            )
+        )
+        result._query_compiler.columns = [
+            c if c != "size" else name for c in result.columns
+        ]
+        return result
+
+    def max(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_max(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=False,
+                ),
+                agg_args=(),
+                agg_kwargs={},
+                drop=False,
+            )
+        )
+
+    def mean(self) -> "DataFrame":
+        # TODO: Non numeric columns are dropped, but in Polars they are converted to null
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_mean(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(numeric_only=True),
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def median(self) -> "DataFrame":
+        # TODO: Non numeric columns are dropped, but in Polars they are converted to null
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_median(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
+                    sort=not self.maintain_order,
+                    as_index=True,
+                ),
+                agg_args=(),
+                agg_kwargs=dict(numeric_only=True),
+                drop=False,
+            ).reset_index(drop=False)
+        )
+
+    def min(self) -> "DataFrame":
+        return self.df.__constructor__(
+            _query_compiler=self.df._query_compiler.groupby_min(
+                self.by,
+                axis=0,
+                groupby_kwargs=dict(
sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs={}, + drop=False, + ) + ) + + def n_unique(self) -> "DataFrame": + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_nunique( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs={}, + drop=False, + ) + ) + + def quantile(self, quantile: float, interpolation="nearest") -> "DataFrame": + # TODO: Non numeric columns are dropped, but in Polars they are converted to null + # TODO: interpolation types not yet supported + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_quantile( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=True, + ), + agg_args=(), + agg_kwargs=dict(numeric_only=True, q=quantile), + drop=False, + ).reset_index(drop=False) + ) + + def sum(self) -> "DataFrame": + # TODO: Non numeric columns are dropped, but in Polars they are converted to null + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_sum( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=True, + ), + agg_args=(), + agg_kwargs=dict(numeric_only=True), + drop=False, + ).reset_index(drop=False) + ) + + def tail(self, n: int = 5): + return self.df.__constructor__( + _query_compiler=self.df._query_compiler.groupby_tail( + self.by, + axis=0, + groupby_kwargs=dict( + sort=not self.maintain_order, + as_index=False, + ), + agg_args=(), + agg_kwargs=dict(n=n), + drop=False, + ) + ) diff --git a/modin/polars/lazyframe.py b/modin/polars/lazyframe.py new file mode 100644 index 00000000000..8616b6ae15c --- /dev/null +++ b/modin/polars/lazyframe.py @@ -0,0 +1,22 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from modin.polars.base import BasePolarsDataset + + +class LazyFrame(BasePolarsDataset): + """ + Stub for Lazy Frame implementation. + """ + + pass diff --git a/modin/polars/series.py b/modin/polars/series.py new file mode 100644 index 00000000000..8db757908c9 --- /dev/null +++ b/modin/polars/series.py @@ -0,0 +1,2159 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Module houses `Series` class, that is distributed version of `polars.Series`.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Sequence + +import numpy as np +import pandas +import polars +from polars._utils.various import no_default + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.error_message import ErrorMessage +from modin.pandas import Series as ModinPandasSeries +from modin.pandas.io import from_pandas +from modin.polars.base import BasePolarsDataset + +if TYPE_CHECKING: + from numpy.typing import ArrayLike + from polars import PolarsDataType + + from modin.polars import DataFrame + + +class Series(BasePolarsDataset): + def __init__( + self, + name: str | "ArrayLike" | None = None, + values: "ArrayLike" | None = None, + dtype: "PolarsDataType | None" = None, + *, + strict: "bool" = True, + nan_to_null: "bool" = False, + dtype_if_empty: "PolarsDataType" = polars.Null, + _query_compiler: BaseQueryCompiler | None = None, + ) -> None: + if _query_compiler is None: + if isinstance(values, ModinPandasSeries): + self._query_compiler = values._query_compiler.copy() + else: + self._query_compiler: BaseQueryCompiler = from_pandas( + polars.Series( + name=name, + values=values, + dtype=dtype, + strict=strict, + nan_to_null=nan_to_null, + dtype_if_empty=dtype_if_empty, + ) + .to_pandas() + .to_frame() + )._query_compiler + else: + self._query_compiler: BaseQueryCompiler = _query_compiler + + def __repr__(self): + return repr( + polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)) + ) + + _sorted = False + _descending = None + + def to_pandas(self) -> ModinPandasSeries: + return ModinPandasSeries(query_compiler=self._query_compiler) + + def arg_max(self) -> int: + """ + Get the index of the maximum value. + + Returns: + Index of the maximum value. + """ + return self.to_pandas().argmax() + + def arg_min(self) -> int: + """ + Get the index of the minimum value. + + Returns: + Index of the minimum value. + """ + return self.to_pandas().argmin() + + def implode(self) -> "Series": + """ + Aggregate values into a list. + + Returns: + Imploded Series. + """ + raise NotImplementedError("not yet") + + def max(self) -> Any: + """ + Get the maximum value. + + Returns: + Maximum value. + """ + return self.to_pandas().max() + + def min(self) -> Any: + """ + Get the minimum value. + + Returns: + Minimum value. + """ + return self.to_pandas().min() + + def mean(self) -> Any: + """ + Get the mean value. + + Returns: + Mean value. + """ + return self.to_pandas().mean() + + def median(self) -> Any: + """ + Get the median value. + + Returns: + Median value. + """ + return self.to_pandas().median() + + def mode(self) -> Any: + """ + Get the mode value. + + Returns: + Mode value. + """ + return self.to_pandas().mode() + + def nan_max(self) -> Any: + """ + Get the maximum value, ignoring NaN values. + + Returns: + Maximum value. + """ + return self.to_pandas().max(skipna=True) + + def nan_min(self) -> Any: + """ + Get the minimum value, ignoring NaN values. + + Returns: + Minimum value. 
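+
+        Example (illustrative): for values ``[2.0, NaN, 1.0]`` this returns
+        ``1.0``, since NaN values are skipped.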
+ """ + return self.to_pandas().min(skipna=True) + + def product(self) -> Any: + """ + Get the product of all values. + + Returns: + Product of all values. + """ + return self.to_pandas().product() + + def quantile(self, quantile: float, interpolation: str = "nearest") -> float | None: + """ + Get the quantile value. + + Args: + quantile: Quantile to calculate. + interpolation: Interpolation method. + + Returns: + Quantile value. + """ + return self.to_pandas().quantile(quantile, interpolation=interpolation) + + def std(self, ddof: int = 1) -> float: + """ + Get the standard deviation. + + Args: + ddof: Delta Degrees of Freedom. + + Returns: + Standard deviation. + """ + return self.to_pandas().std(ddof=ddof) + + def sum(self) -> Any: + """ + Get the sum of all values. + + Returns: + Sum of all values. + """ + return self.to_pandas().sum() + + def var(self, ddof: int = 1) -> float: + """ + Get the variance. + + Args: + ddof: Delta Degrees of Freedom. + + Returns: + Variance. + """ + return self.to_pandas().var(ddof=ddof) + + @property + def arr(self) -> polars.series.array.ArrayNameSpace: + """ + Get the underlying array. + + Returns: + Underlying array. + """ + return polars.from_pandas(self._query_compiler.to_pandas().squeeze(axis=1)).arr + + @property + def dtype(self) -> polars.datatypes.DataType: + """ + Get the data type. + + Returns: + Data type. + """ + return polars.from_pandas( + pandas.Series().astype(self._query_compiler.dtypes.iloc[0]) + ).dtype + + @property + def name(self) -> str: + """ + Get the name. + + Returns: + Name. + """ + return self._query_compiler.columns[0] + + @property + def shape(self) -> tuple[int]: + """ + Get the shape. + + Returns: + Shape. + """ + return (len(self._query_compiler.index),) + + flags = [] + + @property + def bin(self): + raise NotImplementedError("not yet") + + def all(self) -> bool: + """ + Check if all values are True. + + Returns: + True if all values are True, False otherwise. + """ + return self.to_pandas().all() + + def any(self) -> bool: + """ + Check if any value is True. + + Returns: + True if any value is True, False otherwise. + """ + return self.to_pandas().any() + + def not_(self) -> "Series": + """ + Negate the values. + + Returns: + Negated Series. + """ + return self.__constructor__(_query_compiler=self._query_compiler.invert()) + + @property + def cat(self): + raise NotImplementedError("not yet") + + def abs(self) -> "Series": + """ + Get the absolute values. + + Returns: + Absolute values Series. + """ + return self.__constructor__(_query_compiler=self._query_compiler.abs()) + + def arccos(self) -> "Series": + """ + Get the arc cosine values. + + Returns: + Arc cosine values Series. + """ + raise NotImplementedError("not yet") + + def arccosh(self) -> "Series": + """ + Get the hyperbolic arc cosine values. + + Returns: + Hyperbolic arc cosine values Series. + """ + raise NotImplementedError("not yet") + + def arcsin(self) -> "Series": + """ + Get the arc sine values. + + Returns: + Arc sine values Series. + """ + raise NotImplementedError("not yet") + + def arcsinh(self) -> "Series": + """ + Get the hyperbolic arc sine values. + + Returns: + Hyperbolic arc sine values Series. + """ + raise NotImplementedError("not yet") + + def arctan(self) -> "Series": + """ + Get the arc tangent values. + + Returns: + Arc tangent values Series. + """ + raise NotImplementedError("not yet") + + def arctanh(self) -> "Series": + """ + Get the hyperbolic arc tangent values. + + Returns: + Hyperbolic arc tangent values Series. 
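+
+        Note: not implemented yet. A possible fallback under the pandas
+        backend is applying ``numpy.arctanh`` to ``self.to_pandas()``.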
+        """
+        raise NotImplementedError("not yet")
+
+    def arg_true(self) -> "Series":
+        """
+        Get the indices where the Series is True.
+
+        Returns:
+            Series of indices of the True values.
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.reset_index(drop=False)
+            .getitem_array(self._query_compiler)
+            .getitem_column_array(0, numeric=True)
+        ).rename(self.name)
+
+    def arg_unique(self) -> "Series":
+        """
+        Get the first-occurrence index of each unique value.
+
+        Returns:
+            Series of indices of the unique values.
+        """
+        raise NotImplementedError("not yet")
+
+    def cbrt(self) -> "Series":
+        """
+        Get the cube root values.
+
+        Returns:
+            Cube root values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cos(self) -> "Series":
+        """
+        Get the cosine values.
+
+        Returns:
+            Cosine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cosh(self) -> "Series":
+        """
+        Get the hyperbolic cosine values.
+
+        Returns:
+            Hyperbolic cosine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cot(self) -> "Series":
+        """
+        Get the cotangent values.
+
+        Returns:
+            Cotangent values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def cum_count(self) -> "Series":
+        """
+        Get the cumulative count of non-null values.
+
+        Returns:
+            Cumulative count values Series.
+        """
+        # Polars counts the non-null values cumulatively.
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.notna().cumsum()
+        )
+
+    def cum_max(self) -> "Series":
+        """
+        Get the cumulative maximum values.
+
+        Returns:
+            Cumulative maximum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cummax())
+
+    def cum_min(self) -> "Series":
+        """
+        Get the cumulative minimum values.
+
+        Returns:
+            Cumulative minimum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cummin())
+
+    def cum_prod(self) -> "Series":
+        """
+        Get the cumulative product values.
+
+        Returns:
+            Cumulative product values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cumprod())
+
+    def cum_sum(self) -> "Series":
+        """
+        Get the cumulative sum values.
+
+        Returns:
+            Cumulative sum values Series.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.cumsum())
+
+    def cumulative_eval(
+        self, expr, min_periods: int = 1, *, parallel: bool = False
+    ) -> "Series":
+        """
+        Get the cumulative evaluation values.
+
+        Args:
+            expr: Expression to evaluate.
+            min_periods: Minimum number of periods.
+
+        Returns:
+            Cumulative evaluation values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def diff(self, n: int = 1, null_behavior: str = "ignore") -> "Series":
+        """
+        Calculate the first discrete difference between shifted items.
+
+        Args:
+            n: Number of periods to shift.
+            null_behavior: Null behavior.
+
+        Returns:
+            Difference values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def dot(self, other) -> int | float | None:
+        """
+        Calculate the dot product.
+
+        Args:
+            other: Other Series.
+
+        Returns:
+            Dot product.
+        """
+        if isinstance(other, Series):
+            other = other.to_pandas()
+        return self.to_pandas().dot(other)
+
+    def entropy(
+        self, base: float = 2.718281828459045, *, normalize: bool = False
+    ) -> float:
+        """
+        Calculate the entropy.
+
+        Args:
+            base: Logarithm base.
+            normalize: Normalize the entropy.
+
+        Returns:
+            Entropy.
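+
+        Note: not implemented yet. Polars computes ``-sum(pk * log(pk))``
+        (with the logarithm taken in ``base``) over the values ``pk``,
+        optionally normalized to sum to one; a pandas-based fallback could
+        compute the same expression.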
+ """ + raise NotImplementedError("not yet") + + def ewm_mean( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted mean. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted mean Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .mean() + ) + + def ewm_mean_by(self, by, *, half_life: int | None = None) -> "Series": + """ + Calculate the exponential weighted mean by group. + + Args: + by: Grouping Series. + + Returns: + Exponential weighted mean Series. + """ + raise NotImplementedError("not yet") + + def ewm_std( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted standard deviation. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted standard deviation Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .std() + ) + + def ewm_var( + self, + com: int | None = None, + span: int | None = None, + half_life: int | None = None, + alpha: float | None = None, + *, + adjust: bool = True, + min_periods: int = 1, + ignore_nulls: bool | None = None, + ) -> "Series": + """ + Calculate the exponential weighted variance. + + Args: + com: Center of mass. + span: Span. + + Returns: + Exponential weighted variance Series. + """ + return self.__constructor__( + self.to_pandas() + .ewm( + com=com, + span=span, + halflife=half_life, + alpha=alpha, + adjust=adjust, + min_periods=min_periods, + ignore_na=ignore_nulls, + ) + .var() + ) + + def exp(self) -> "Series": + """ + Calculate the exponential values. + + Returns: + Exponential values Series. + """ + return self.__constructor__(self.to_pandas().exp()) + + def hash( + self, + seed: int = 0, + seed_1: int | None = None, + seed_2: int | None = None, + seed_3: int | None = None, + ) -> "Series": + """ + Calculate the hash values. + + Args: + seed: Seed. + seed_1: Seed 1. + seed_2: Seed 2. + seed_3: Seed 3. + + Returns: + Hash values Series. + """ + raise NotImplementedError("not yet") + + def hist( + self, + bins: list[float] | None = None, + *, + bin_count: int | None = None, + include_category: bool = True, + include_breakpoint: bool = True, + ) -> "Series": + """ + Calculate the histogram. + + Args: + bins: Bins. + bin_count: Bin count. + + Returns: + Histogram Series. + """ + raise NotImplementedError("not yet") + + def is_between(self, lower_bound, upper_bound, closed: str = "both") -> "Series": + """ + Check if values are between the bounds. + + Args: + lower_bound: Lower bound. + upper_bound: Upper bound. + closed: Closed bounds. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def kurtosis(self, *, fisher: bool = True, bias: bool = True) -> float | None: + """ + Calculate the kurtosis. + + Args: + fisher: Fisher method. + bias: Bias method. + + Returns: + Kurtosis. 
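+
+        Note: delegated to pandas ``Series.kurt``, which always computes the
+        bias-corrected Fisher (excess) kurtosis; the ``fisher`` and ``bias``
+        arguments are not honored yet.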
+        """
+        # pandas always computes the bias-corrected Fisher (excess) kurtosis
+        # and does not accept `fisher`/`bias` keywords.
+        if not fisher or not bias:
+            ErrorMessage.warn("`fisher` and `bias` arguments are ignored")
+        return self.to_pandas().kurt()
+
+    def log(self, base: float = 2.718281828459045) -> "Series":
+        """
+        Calculate the logarithm values.
+
+        Args:
+            base: Logarithm base.
+
+        Returns:
+            Logarithm values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def log10(self) -> "Series":
+        """
+        Calculate the base 10 logarithm values.
+
+        Returns:
+            Base 10 logarithm values Series.
+        """
+        return self.log(10)
+
+    def log1p(self) -> "Series":
+        """
+        Calculate the natural logarithm of 1 plus the values.
+
+        Returns:
+            Natural logarithm of 1 plus the values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def replace(
+        self,
+        mapping: dict[Any, Any],
+        *,
+        default: Any = no_default,
+        return_dtype=None,
+    ) -> "Series":
+        """
+        Map values to other values.
+
+        Args:
+            mapping: Mapping of old values to new values.
+            default: Value to use for values not in the mapping; values that
+                do not match are kept when no default is given.
+
+        Returns:
+            Mapped Series.
+        """
+        if default is no_default:
+            # Polars keeps values that do not match the mapping.
+            return self.__constructor__(
+                self.to_pandas().apply(lambda x: mapping.get(x, x))
+            )
+        return self.__constructor__(
+            self.to_pandas().apply(lambda x: mapping.get(x, default))
+        )
+
+    def pct_change(self, n: int = 1) -> "Series":
+        """
+        Calculate the percentage change.
+
+        Args:
+            n: Number of periods to shift.
+
+        Returns:
+            Percentage change Series.
+        """
+        return self.__constructor__(self.to_pandas().pct_change(n))
+
+    def peak_max(self) -> "Series":
+        """
+        Get the peak maximum values.
+
+        Returns:
+            Peak maximum values Series.
+        """
+        return self.__eq__(self.max())
+
+    def peak_min(self) -> "Series":
+        """
+        Get the peak minimum values.
+
+        Returns:
+            Peak minimum values Series.
+        """
+        return self.__eq__(self.min())
+
+    def rank(
+        self,
+        method: str = "average",
+        *,
+        descending: bool = False,
+        seed: int | None = None,
+    ) -> "Series":
+        """
+        Calculate the rank.
+
+        Args:
+            method: Rank method.
+            descending: Rank in descending order.
+
+        Returns:
+            Rank Series.
+        """
+        # TODO: support seed
+        if method not in ["average", "min", "max", "first", "dense"]:
+            raise ValueError(f"method {method} not supported")
+        return self.__constructor__(
+            self.to_pandas().rank(method=method, ascending=not descending)
+        )
+
+    def rolling_map(
+        self,
+        function: callable,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling function.
+
+        Args:
+            function: Function to apply.
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Applied Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .apply(function)
+        )
+
+    def rolling_max(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling maximum over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling maximum Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .max()
+        )
+
+    def rolling_mean(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling mean over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling mean Series.
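+
+        Example (illustrative): with ``min_periods=1`` (the default here),
+        the first window is partial, so values ``[1.0, 2.0, 4.0]`` with
+        ``window_size=2`` give ``[1.0, 1.5, 3.0]``.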
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .mean()
+        )
+
+    def rolling_median(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling median over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling median Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .median()
+        )
+
+    def rolling_min(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling minimum over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling minimum Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .min()
+        )
+
+    def rolling_quantile(
+        self,
+        window_size: int,
+        quantile: float,
+        interpolation: str = "nearest",
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling quantile over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            quantile: Quantile to compute.
+            interpolation: Interpolation method.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling quantile Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .quantile(quantile, interpolation=interpolation)
+        )
+
+    def rolling_skew(self, window_size: int, *, bias: bool = False) -> "Series":
+        """
+        Apply a rolling skewness over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            bias: Whether to use a biased estimator (ignored for now).
+
+        Returns:
+            Rolling skewness Series.
+        """
+        # TODO: `bias` is ignored; pandas computes the bias-corrected skew.
+        return self.__constructor__(
+            self.to_pandas().rolling(window=window_size).skew()
+        )
+
+    def rolling_std(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+        ddof: int = 1,
+    ) -> "Series":
+        """
+        Apply a rolling standard deviation over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+            ddof: Delta Degrees of Freedom.
+
+        Returns:
+            Rolling standard deviation Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .std(ddof=ddof)
+        )
+
+    def rolling_sum(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+    ) -> "Series":
+        """
+        Apply a rolling sum over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+
+        Returns:
+            Rolling sum Series.
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .sum()
+        )
+
+    def rolling_var(
+        self,
+        window_size: int,
+        weights: list[float] | None = None,
+        min_periods: int = 1,
+        *,
+        center: bool = False,
+        ddof: int = 1,
+    ) -> "Series":
+        """
+        Apply a rolling variance over the Series.
+
+        Args:
+            window_size: Size of the rolling window.
+            weights: Optional window weights (not supported yet).
+            min_periods: Minimum number of observations in a window.
+            center: Whether to center the window.
+            ddof: Delta Degrees of Freedom.
+
+        Returns:
+            Rolling variance Series.
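+
+        Note: ``ddof=1`` is the sample variance, matching the default of the
+        pandas rolling implementation this delegates to.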
+        """
+        if weights is not None:
+            raise NotImplementedError("not yet")
+        return self.__constructor__(
+            self.to_pandas()
+            .rolling(window=window_size, min_periods=min_periods, center=center)
+            .var(ddof=ddof)
+        )
+
+    def search_sorted(self, element, side: str = "any") -> int | "Series":
+        """
+        Search for the element in the sorted Series.
+
+        Args:
+            element: Element to search.
+            side: Side to search.
+
+        Returns:
+            Index of the element.
+        """
+        # Treat polars' "any" as pandas' "left".
+        if side == "any":
+            side = "left"
+        return self.__constructor__(self.to_pandas().searchsorted(element, side=side))
+
+    def sign(self) -> "Series":
+        """
+        Get the sign values.
+
+        Returns:
+            Sign values Series.
+        """
+        return self.__lt__(0).__mul__(-1).__add__(self.__gt__(0))
+
+    def sin(self) -> "Series":
+        """
+        Get the sine values.
+
+        Returns:
+            Sine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def sinh(self) -> "Series":
+        """
+        Get the hyperbolic sine values.
+
+        Returns:
+            Hyperbolic sine values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def skew(self, *, bias: bool = True) -> float:
+        """
+        Calculate the skewness.
+
+        Args:
+            bias: Bias method.
+
+        Returns:
+            Skewness.
+        """
+        # TODO: `bias` is ignored; pandas computes the bias-corrected skew.
+        return self.to_pandas().skew()
+
+    def sqrt(self) -> "Series":
+        """
+        Get the square root values.
+
+        Returns:
+            Square root values Series.
+        """
+        # pandas Series has no `.sqrt()` method; use a power instead.
+        return self.__constructor__(self.to_pandas() ** 0.5)
+
+    def tan(self) -> "Series":
+        """
+        Get the tangent values.
+
+        Returns:
+            Tangent values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def tanh(self) -> "Series":
+        """
+        Get the hyperbolic tangent values.
+
+        Returns:
+            Hyperbolic tangent values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def chunk_lengths(self) -> list[int]:
+        """
+        Get the chunk lengths.
+
+        Returns:
+            Chunk lengths.
+        """
+        raise NotImplementedError("not yet")
+
+    def describe(
+        self,
+        percentiles: Sequence[float] | float | None = (0.25, 0.5, 0.75),
+        interpolation: str = "nearest",
+    ):
+        """
+        Generate descriptive statistics.
+
+        Args:
+            percentiles: Percentiles to calculate.
+
+        Returns:
+            Descriptive statistics.
+        """
+        # TODO: `interpolation` is not supported yet.
+        return self.to_pandas().describe(percentiles=percentiles)
+
+    def estimated_size(self) -> int:
+        """
+        Get the estimated size.
+
+        Returns:
+            Estimated size.
+        """
+        return self.to_pandas().memory_usage(index=False)
+
+    def has_nulls(self) -> bool:
+        """
+        Check if there are null values.
+
+        Returns:
+            True if there are null values, False otherwise.
+        """
+        return self.to_pandas().isnull().any()
+
+    has_validity = has_nulls
+
+    def is_finite(self) -> "Series":
+        """
+        Check if the values are finite.
+
+        Returns:
+            True if the values are finite, False otherwise.
+        """
+        # TODO: NaN values should also be reported as non-finite.
+        return self.__ne__(np.inf).__and__(self.__ne__(-np.inf))
+
+    def is_first_distinct(self) -> "Series":
+        """
+        Check if the values are the first occurrence.
+
+        Returns:
+            True if the values are the first occurrence, False otherwise.
+        """
+        raise NotImplementedError("not yet")
+
+    def is_in(self, other: "Series" | list[Any]) -> "Series":
+        """
+        Check if the values are in the other Series.
+
+        Args:
+            other: Other Series.
+
+        Returns:
+            True if the values are in the other Series, False otherwise.
+        """
+        if isinstance(other, Series):
+            other = other.to_pandas()
+        return self.__constructor__(self.to_pandas().isin(other))
+
+    def is_infinite(self) -> "Series":
+        """
+        Check if the values are infinite.
+
+        Returns:
+            True if the values are infinite, False otherwise.
+        """
+        return self.__eq__(np.inf).__or__(self.__eq__(-np.inf))
+
+    def is_last_distinct(self) -> "Series":
+        """
+        Check if the values are the last occurrence.
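+
+        Note: not implemented yet. A possible pandas-based fallback is
+        ``~self.to_pandas().duplicated(keep="last")``.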
+
+        Returns:
+            True if the values are the last occurrence, False otherwise.
+        """
+        raise NotImplementedError("not yet")
+
+    def is_nan(self) -> "Series":
+        """
+        Check if the values are NaN.
+
+        Returns:
+            True if the values are NaN, False otherwise.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.isna())
+
+    def is_not_nan(self) -> "Series":
+        """
+        Check if the values are not NaN.
+
+        Returns:
+            True if the values are not NaN, False otherwise.
+        """
+        return self.__constructor__(_query_compiler=self._query_compiler.notna())
+
+    def is_not_null(self) -> "Series":
+        """
+        Check if the values are not null.
+
+        Returns:
+            True if the values are not null, False otherwise.
+        """
+        # Null and NaN coincide under the pandas-backed representation.
+        return self.is_not_nan()
+
+    def is_null(self) -> "Series":
+        """
+        Check if the values are null.
+
+        Returns:
+            True if the values are null, False otherwise.
+        """
+        # Null and NaN coincide under the pandas-backed representation.
+        return self.is_nan()
+
+    def is_sorted(
+        self,
+        *,
+        descending: bool = False,
+        nulls_last: bool = False,
+    ) -> bool:
+        """
+        Check if the values are sorted.
+
+        Args:
+            descending: Descending order.
+
+        Returns:
+            True if the values are sorted, False otherwise.
+        """
+        # TODO: `nulls_last` is not supported yet.
+        return (
+            self.to_pandas().is_monotonic_increasing
+            if not descending
+            else self.to_pandas().is_monotonic_decreasing
+        )
+
+    def len(self) -> int:
+        """
+        Get the length of the Series.
+
+        Returns:
+            Length of the Series.
+        """
+        return len(self.to_pandas())
+
+    def lower_bound(self) -> "Series":
+        """
+        Get the lower bound values.
+
+        Returns:
+            Lower bound values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def null_count(self) -> int:
+        """
+        Get the number of null values.
+
+        Returns:
+            Number of null values.
+        """
+        return self.to_pandas().isnull().sum()
+
+    def unique_counts(self) -> "Series":
+        """
+        Get the unique counts.
+
+        Returns:
+            Unique counts.
+        """
+        # TODO: polars returns counts in order of first appearance; pandas
+        # `value_counts` sorts by count instead.
+        return self.__constructor__(values=self.to_pandas().value_counts())
+
+    def upper_bound(self) -> "Series":
+        """
+        Get the upper bound values.
+
+        Returns:
+            Upper bound values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def value_counts(
+        self, *, sort: bool = False, parallel: bool = False, name: str = "count"
+    ) -> "DataFrame":
+        """
+        Get the value counts.
+
+        Returns:
+            DataFrame of unique values and their counts.
+        """
+        from modin.polars import DataFrame
+
+        # Name the counts `name` before resetting the index so the result has
+        # polars' column layout: [self.name, name].
+        return DataFrame(
+            self.to_pandas()
+            .value_counts(sort=sort)
+            .rename(name)
+            .reset_index(drop=False)
+        )
+
+    def to_frame(self, name: str | None = None) -> "DataFrame":
+        """
+        Convert the Series to a DataFrame.
+
+        Args:
+            name: Name for the resulting column; the current name is kept
+                when None.
+
+        Returns:
+            DataFrame representation of the Series.
+        """
+        from modin.polars import DataFrame
+
+        result = DataFrame(_query_compiler=self._query_compiler)
+        if name is not None:
+            result = result.rename({self.name: name})
+        return result
+
+    def to_init_repr(self, n: int = 1000) -> str:
+        """
+        Convert the Series to an instantiable string representation.
+
+        Args:
+            n: First n elements.
+
+        Returns:
+            Instantiable string representation.
+        """
+        return polars.from_pandas(
+            self.slice(0, n)._query_compiler.to_pandas()
+        ).to_init_repr()
+
+    @property
+    def list(self):
+        # TODO: implement list object
+        # https://docs.pola.rs/api/python/stable/reference/series/list.html
+        raise NotImplementedError("not yet")
+
+    def alias(self, name: str) -> "Series":
+        """
+        Rename the Series.
+
+        Args:
+            name: New name.
+
+        Returns:
+            Renamed Series.
+        """
+        return self.to_frame(name).to_series()
+
+    def append(self, other: "Series") -> "Series":
+        """
+        Append another Series.
+
+        Args:
+            other: Other Series.
+
+        Returns:
+            Appended Series.
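+
+        Example (illustrative): appending values ``[1, 2]`` to a Series
+        holding ``[3]`` yields ``[3, 1, 2]``.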
+        """
+        return self.__constructor__(
+            _query_compiler=self._query_compiler.concat(0, other._query_compiler)
+        )
+
+    def arg_sort(
+        self, *, descending: bool = False, nulls_last: bool = False
+    ) -> "Series":
+        """
+        Get the sorted indices.
+
+        Args:
+            descending: Descending order.
+
+        Returns:
+            Sorted indices Series.
+        """
+        # TODO: implement nulls_last
+        result = self.__constructor__(values=self.to_pandas().argsort())
+        if descending:
+            return result.reverse()
+        else:
+            return result
+
+    def ceil(self) -> "Series":
+        """
+        Get the ceiling values.
+
+        Returns:
+            Ceiling values Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def clear(self, n: int = 0) -> "Series":
+        """
+        Create an empty copy of the current Series, with zero to 'n' elements.
+
+        Args:
+            n: Number of elements.
+
+        Returns:
+            Series with n nulls.
+        """
+        raise NotImplementedError("not yet")
+
+    def clip(self, lower_bound=None, upper_bound=None) -> "Series":
+        """
+        Clip the values.
+
+        Args:
+            lower_bound: Lower bound.
+            upper_bound: Upper bound.
+
+        Returns:
+            Clipped values Series.
+        """
+        return self.__constructor__(
+            values=self.to_pandas().clip(lower_bound, upper_bound)
+        )
+
+    def cut(
+        self,
+        breaks: Sequence[float],
+        *,
+        labels: list[str] | None = None,
+        break_point_label: str = "breakpoint",
+        left_closed: bool = False,
+        include_breaks: bool = False,
+        as_series: bool = True,
+    ) -> "BasePolarsDataset":
+        """
+        Bin continuous values into discrete categories.
+
+        Returns:
+            Binned Series. Not implemented yet.
+        """
+        raise NotImplementedError("not yet")
+
+    def extend_constant(self, value) -> "Series":
+        """
+        Extend the Series with a constant value.
+
+        Args:
+            value: Constant value.
+
+        Returns:
+            Extended Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def floor(self) -> "BasePolarsDataset":
+        # Flooring is equivalent to floor division by 1.
+        return self.__floordiv__(1)
+
+    def gather(self, indices) -> "Series":
+        """
+        Gather values by indices.
+
+        Args:
+            indices: Indices.
+
+        Returns:
+            Gathered Series.
+        """
+        return self.__constructor__(
+            values=self.to_pandas().iloc[
+                (
+                    indices._query_compiler
+                    if hasattr(indices, "_query_compiler")
+                    else indices
+                )
+            ]
+        )
+
+    def interpolate_by(self, by) -> "Series":
+        """
+        Interpolate values by group.
+
+        Args:
+            by: Grouping Series.
+
+        Returns:
+            Interpolated Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def item(self, index: int | None = None) -> Any:
+        """
+        Get the item at the index.
+
+        Args:
+            index: Index. When None, the Series must have exactly one element.
+
+        Returns:
+            Item at the index.
+        """
+        if index is None:
+            if self.len() != 1:
+                raise ValueError(
+                    "can only call .item() if the Series is of length 1"
+                )
+            index = 0
+        return self.to_pandas().iloc[index]
+
+    def new_from_index(self, index: int, length: int) -> "Series":
+        """
+        Create a new Series from the index.
+
+        Args:
+            index: Index.
+            length: Length.
+
+        Returns:
+            New Series.
+        """
+        raise NotImplementedError("not yet")
+
+    def qcut(
+        self,
+        quantiles: Sequence[float] | int,
+        *,
+        labels: Sequence[str] | None = None,
+        left_closed: bool = False,
+        allow_duplicates: bool = False,
+        include_breaks: bool = False,
+        break_point_label: str = "breakpoint",
+        category_labels: str = "category",
+        as_series: bool = True,
+    ) -> "Series" | "DataFrame":
+        """
+        Bin continuous values into discrete categories based on quantiles.
+
+        Args:
+            quantiles: Number of quantiles or sequence of quantiles.
+            labels: Labels for the resulting bins.
+            left_closed: Whether the intervals are left-closed.
+            allow_duplicates: Whether to allow duplicate intervals.
+            include_breaks: Whether to include the breaks in the result.
+            break_point_label: Label for the break points.
+            category_labels: Label for the categories.
+            as_series: Whether to return a Series.
+
+        Returns:
+            Binned Series.
+ """ + raise NotImplementedError("not yet") + + def rechunk(self, *, in_place: bool = False) -> "Series": + """ + Rechunk the Series. + + Args: + in_place: In-place operation. + + Returns: + Rechunked Series. + """ + raise NotImplementedError("not yet") + + rename = alias + + def reshape(self, dimensions, nested_type) -> "Series": + """ + Reshape the Series. + + Args: + dimensions: Dimensions. + nested_type: Nested type. + + Returns: + Reshaped Series. + """ + raise NotImplementedError("not yet") + + def reverse(self) -> "Series": + """ + Reverse the Series. + + Returns: + Reversed Series. + """ + return self.__constructor__(values=self.to_pandas().iloc[::-1]) + + def rle(self) -> "Series": + """ + Run-length encode the Series. + + Returns: + Run-length encoded Series. + """ + raise NotImplementedError("not yet") + + def rle_id(self) -> "Series": + """ + Run-length encode the Series with IDs. + + Returns: + Run-length encoded Series with IDs. + """ + raise NotImplementedError("not yet") + + def round(self, decimals: int = 0) -> "Series": + """ + Round the values. + + Args: + decimals: Number of decimals. + + Returns: + Rounded values Series. + """ + return self.__constructor__(values=self.to_pandas().round(decimals)) + + def round_sig_figs(self, digits: int) -> "Series": + """ + Round the values to significant figures. + + Args: + digits: Number of significant figures. + + Returns: + Rounded values Series. + """ + raise NotImplementedError("not yet") + + def scatter(self, indices, values) -> "Series": + """ + Scatter values by indices. + + Args: + indices: Indices. + values: Values. + + Returns: + Scattered Series. + """ + raise NotImplementedError("not yet") + + def set(self, filter: "Series", value: int | float | str | bool | None) -> "Series": + """ + Set values by filter. + + Args: + filter: Filter. + value: Value. + + Returns: + Set Series. + """ + raise NotImplementedError("not yet") + + def shrink_dtype(self) -> "Series": + """ + Shrink the data type. + + Returns: + Shrunk Series. + """ + raise NotImplementedError("not yet") + + def shuffle(self, seed: int | None = None) -> "Series": + """ + Shuffle the Series. + + Args: + seed: Seed. + + Returns: + Shuffled Series. + """ + raise NotImplementedError("not yet") + + def zip_with(self, mask: "Series", other: "Series") -> "Series": + """ + Zip the Series with another Series. + + Args: + mask: Mask Series. + other: Other Series. + + Returns: + Zipped Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.where( + mask._query_compiler, other._query_compiler + ) + ) + + def map_elements( + self, + function: callable, + return_dtype=None, + *, + skip_nulls: bool = True, + ) -> "Series": + """ + Map the elements. + + Args: + function: Function to apply. + + Returns: + Mapped Series. + """ + if return_dtype is not None or skip_nulls is False: + ErrorMessage.warn( + "`return_dtype` and `skip_nulls=False` are not supported yet" + ) + return self.__constructor__(values=self.to_pandas().apply(function)) + + def reinterpret(self, *, signed: bool = True) -> "Series": + """ + Reinterpret the data type of the series as signed or unsigned. + + Args: + signed: If True, reinterpret as signed, otherwise as unsigned. + + Returns: + Reinterpreted Series. + """ + raise NotImplementedError("not yet") + + def set_sorted(self, *, descending: bool = False) -> "Series": + """ + Set the Series as sorted. + + Args: + descending: Descending order. + + Returns: + Sorted Series. 
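+
+        Note: this only sets the ``_sorted`` and ``_descending`` flags on
+        this object; no data is reordered.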
+ """ + self._sorted = True + self._descending = descending + return self + + def to_physical(self) -> "Series": + """ + Convert the Series to physical. + + Returns: + Physical Series. + """ + raise NotImplementedError("not yet") + + def get_chunks(self) -> list["Series"]: + """ + Get the chunks. + + Returns: + Chunks. + """ + raise NotImplementedError("not yet") + + @property + def str(self): + # TODO: implement str object + # https://docs.pola.rs/api/python/stable/reference/series/string.html + raise NotImplementedError("not yet") + + @property + def struct(self): + # TODO: implement struct object + # https://docs.pola.rs/api/python/stable/reference/series/struct.html + raise NotImplementedError("not yet") + + @property + def dt(self): + # TODO: implement dt object + # https://docs.pola.rs/api/python/stable/reference/series/temporal.html + raise NotImplementedError("not yet") + + def __len__(self) -> int: + """ + Get the length of the Series. + """ + return self.len() + + def __matmul__(self, other) -> "Series": + """ + Matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __radd__(self, other) -> "Series": + """ + Right addition. + + Args: + other: Other Series. + + Returns: + Added Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.radd(other, axis=0) + ) + + def __rand__(self, other) -> "Series": + """ + Right and. + + Args: + other: Other Series. + + Returns: + And Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rand__(other, axis=0) + ) + + def __rfloordiv__(self, other) -> "Series": + """ + Right floor division. + + Args: + other: Other Series. + + Returns: + Floored Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rfloordiv(other, axis=0) + ) + + def __rmatmul__(self, other) -> "Series": + """ + Right matrix multiplication. + + Args: + other: Other Series. + + Returns: + Matrix multiplication Series. + """ + raise NotImplementedError("not yet") + + def __rmod__(self, other) -> "Series": + """ + Right modulo. + + Args: + other: Other Series. + + Returns: + Modulo Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmod(other, axis=0) + ) + + def __rmul__(self, other) -> "Series": + """ + Right multiplication. + + Args: + other: Other Series. + + Returns: + Multiplied Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rmul(other, axis=0) + ) + + def __ror__(self, other) -> "Series": + """ + Right or. + + Args: + other: Other Series. + + Returns: + Or Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__ror__(other, axis=0) + ) + + def __rpow__(self, other) -> "Series": + """ + Right power. + + Args: + other: Other Series. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rpow(other, axis=0) + ) + + def __rsub__(self, other) -> "Series": + """ + Right subtraction. + + Args: + other: Other Series. + + Returns: + Subtracted Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rsub(other, axis=0) + ) + + def __rtruediv__(self, other) -> "Series": + """ + Right true division. + + Args: + other: Other Series. + + Returns: + Divided Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.rtruediv(other, axis=0) + ) + + def __rxor__(self, other) -> "Series": + """ + Right xor. 
+ + Args: + other: Other Series. + + Returns: + Xor Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.__rxor__(other, axis=0) + ) + + def eq(self, other) -> "Series": + """ + Check if the values are equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.eq(other._query_compiler) + ) + + def eq_missing(self, other) -> "Series": + """ + Check if the values are equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def ge(self, other) -> "Series": + """ + Check if the values are greater than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ge(other._query_compiler) + ) + + def gt(self, other) -> "Series": + """ + Check if the values are greater than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.gt(other._query_compiler) + ) + + def le(self, other) -> "Series": + """ + Check if the values are less than or equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.le(other._query_compiler) + ) + + def lt(self, other) -> "Series": + """ + Check if the values are less than the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.lt(other._query_compiler) + ) + + def n_unique(self) -> int: + """ + Get the number of unique values. + + Returns: + Number of unique values. + """ + return self._query_compiler.nunique().to_pandas().squeeze(axis=None) + + def ne(self, other) -> "Series": + """ + Check if the values are not equal to the other Series. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.ne(other._query_compiler) + ) + + def ne_missing(self, other) -> "Series": + """ + Check if the values are not equal to the other Series, including missing values. + + Args: + other: Other Series. + + Returns: + Boolean Series. + """ + raise NotImplementedError("not yet") + + def pow(self, exponent) -> "Series": + """ + Raise the values to the power of the exponent. + + Args: + exponent: Exponent. + + Returns: + Powered Series. + """ + return self.__constructor__( + _query_compiler=self._query_compiler.pow(exponent, axis=0) + ) + + def replace_strict( + self, old, new=no_default, *, default=no_default, return_dtype=None + ) -> "Series": + """ + Replace values strictly. + + Args: + old: Old values. + new: New values. + default: Default value. + + Returns: + Replaced Series. + """ + raise NotImplementedError("not yet") + + def to_list(self) -> list: + """ + Convert the Series to a list. + + Returns: + List representation of the Series. + """ + return self._to_polars().tolist() + + def drop_nans(self) -> "Series": + """ + Drop NaN values. + + Returns: + Series without NaN values. 
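+
+        Example (illustrative): for values ``[1.0, NaN, 3.0]`` the result
+        holds ``[1.0, 3.0]``.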
+ """ + return self.__constructor__( + _query_compiler=self._query_compiler.dropna(how="any") + ) diff --git a/modin/tests/polars/test_dataframe.py b/modin/tests/polars/test_dataframe.py new file mode 100644 index 00000000000..29936c0b0f7 --- /dev/null +++ b/modin/tests/polars/test_dataframe.py @@ -0,0 +1,25 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import polars +import polars.testing + +import modin.polars as pl + + +def test_init_roundtrip(): + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + df = pl.DataFrame(data) + polars_df = polars.DataFrame(data) + to_polars = polars.from_pandas(df._query_compiler.to_pandas()) + polars.testing.assert_frame_equal(polars_df, to_polars)
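+
+
+# An illustrative follow-up test (a sketch, not part of the original patch);
+# it assumes the Series roundtrip mirrors the DataFrame roundtrip above.
+def test_series_init_roundtrip():
+    values = [1, 2, 3]
+    s = pl.Series(values=values)
+    polars_s = polars.Series(values=values)
+    # Squeeze the single-column frame held by the query compiler back into a
+    # pandas Series before converting to polars.
+    to_polars = polars.from_pandas(s._query_compiler.to_pandas().squeeze(axis=1))
+    polars.testing.assert_series_equal(polars_s, to_polars, check_names=False)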