From 0642028e4923adddfb204fa9331e2e89de454252 Mon Sep 17 00:00:00 2001
From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Thu, 31 Aug 2023 11:22:51 +0200
Subject: [PATCH] replace column with expression

---
 .../dataframe_api/__init__.py                 | 164 ++---
 .../dataframe_api/dataframe_object.py         | 186 ++------
 ...{column_object.py => expression_object.py} | 441 ++++++++----------
 ...olumn_object.rst => expression_object.rst} |   8 +-
 spec/API_specification/index.rst              |  12 +-
 spec/purpose_and_scope.md                     |   7 +-
 6 files changed, 355 insertions(+), 463 deletions(-)
 rename spec/API_specification/dataframe_api/{column_object.py => expression_object.py} (56%)
 rename spec/API_specification/{column_object.rst => expression_object.rst} (50%)

diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py
index 1e7d57b4..e39aa4ff 100644
--- a/spec/API_specification/dataframe_api/__init__.py
+++ b/spec/API_specification/dataframe_api/__init__.py
@@ -3,9 +3,9 @@
 """
 from __future__ import annotations
 
-from typing import Mapping, Sequence, Any
+from typing import Mapping, Sequence, Any, Literal
 
-from .column_object import *
+from .expression_object import *
 from .dataframe_object import DataFrame
 from .groupby_object import *
 from ._types import DType
@@ -13,11 +13,10 @@
 __all__ = [
     "__dataframe_api_version__",
     "DataFrame",
-    "Column",
-    "column_from_sequence",
-    "column_from_1d_array",
+    "col",
     "concat",
-    "dataframe_from_dict",
+    "sorted_indices",
+    "unique_indices",
     "dataframe_from_2d_array",
     "is_null",
     "null",
@@ -43,6 +42,21 @@
 implementation of the dataframe API standard.
 """
 
+def col(name: str) -> Expression:
+    """
+    Instantiate an Expression which selects the given column by name.
+
+    For example, to select column 'species' and then use it to filter
+    a DataFrame, you could do:
+
+    .. code-block:: python
+
+        df: DataFrame
+        namespace = df.__dataframe_namespace__()
+        df = df.get_rows_by_mask(namespace.col('species') == 'setosa')
+    """
+    ...
+
 def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
     """
     Concatenate DataFrames vertically.
@@ -63,104 +77,116 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
     """
     ...
 
-def column_from_sequence(sequence: Sequence[Any], *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]:
+def any_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
     """
-    Construct Column from sequence of elements.
+    Reduction returns an Expression.
+
+    Differs from ``DataFrame.any`` in that the reduction happens
+    for each row, rather than for each column.
 
     Parameters
     ----------
-    sequence : Sequence[object]
-        Sequence of elements. Each element must be of the specified
-        ``dtype``, the corresponding Python builtin scalar type, or
-        coercible to that Python scalar type.
-    name : str, optional
-        Name of column.
-    dtype : DType
-        Dtype of result. Must be specified.
-    api_version: str | None
-        A string representing the version of the dataframe API specification
-        in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
-        If it is ``None``, it should return an object corresponding to
-        latest version of the dataframe API specification. If the given
-        version is invalid or not implemented for the given module, an
-        error should be raised. Default: ``None``.
+    keys : list[str]
+        Column names to consider. If `None`, all columns are considered.
 
-    Returns
-    -------
-    Column
+    Raises
+    ------
+    ValueError
+        If any of the DataFrame's columns is not boolean.
     """
     ...
 
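As an illustration of how ``col`` and ``any_rowwise`` compose with ``DataFrame.get_rows_by_mask``, here is a minimal sketch against the Standard API only (``df`` is a placeholder for a Standard-compliant dataframe, as in the docstrings, and ``'flag_a'`` / ``'flag_b'`` are hypothetical boolean columns):

.. code-block:: python

    df: DataFrame
    namespace = df.__dataframe_namespace__()

    # Keep rows where 'species' equals 'setosa' (a single-column mask) ...
    df = df.get_rows_by_mask(namespace.col('species') == 'setosa')

    # ... or keep rows where at least one of two hypothetical boolean
    # columns is True (a row-wise reduction over the selected columns).
    df = df.get_rows_by_mask(namespace.any_rowwise(['flag_a', 'flag_b']))
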
-def dataframe_from_dict(data: Mapping[str, Column[Any]], *, api_version: str | None = None) -> DataFrame:
+def all_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
     """
-    Construct DataFrame from map of column names to Columns.
+    Reduction returns an Expression.
+
+    Differs from ``DataFrame.all`` in that the reduction happens
+    for each row, rather than for each column.
 
     Parameters
     ----------
-    data : Mapping[str, Column]
-        Column must be of the corresponding type of the DataFrame.
-        For example, it is only supported to build a ``LibraryXDataFrame`` using
-        ``LibraryXColumn`` instances.
-    api_version: str | None
-        A string representing the version of the dataframe API specification
-        in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
-        If it is ``None``, it should return an object corresponding to
-        latest version of the dataframe API specification. If the given
-        version is invalid or not implemented for the given module, an
-        error should be raised. Default: ``None``.
+    keys : list[str]
+        Column names to consider. If `None`, all columns are considered.
 
-    Returns
-    -------
-    DataFrame
-
     Raises
     ------
     ValueError
-        If any of the columns already has a name, and the corresponding key
-        in `data` doesn't match.
-
+        If any of the DataFrame's columns is not boolean.
     """
     ...
 
+def sorted_indices(
+    keys: str | list[str] | None = None,
+    *,
+    ascending: Sequence[bool] | bool = True,
+    nulls_position: Literal['first', 'last'] = 'last',
+) -> Expression:
+    """
+    Return row numbers which would sort according to given columns.
+
+    If you need to sort the DataFrame, use :meth:`DataFrame.sort`.
 
-def column_from_1d_array(array: Any, *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]:
+    Parameters
+    ----------
+    keys : str | list[str], optional
+        Names of columns to sort by.
+        If `None`, sort by all columns.
+    ascending : Sequence[bool] or bool
+        If `True`, sort by all keys in ascending order.
+        If `False`, sort by all keys in descending order.
+        If a sequence, it must be the same length as `keys`,
+        and determines the direction with which to use each
+        key to sort by.
+    nulls_position : ``{'first', 'last'}``
+        Whether null values should be placed at the beginning
+        or at the end of the result.
+        Note that the position of NaNs is unspecified and may
+        vary based on the implementation.
+
+    Returns
+    -------
+    Expression
+
+    Raises
+    ------
+    ValueError
+        If `keys` and `ascending` are sequences of different lengths.
     """
-    Construct Column from 1D array.
+    ...
 
-    See `dataframe_from_2d_array` for related 2D function.
-
-    Only Array-API-compliant 1D arrays are supported.
-    Cross-kind casting is undefined and may vary across implementations.
-    Downcasting is disallowed.
+def unique_indices(keys: str | list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
+    """
+    Return indices corresponding to unique values across selected columns.
 
     Parameters
     ----------
-    array : array
-        array-API compliant 1D array
-    name : str, optional
-        Name to give columns.
-    dtype : DType
-        Dtype of column.
-    api_version: str | None
-        A string representing the version of the dataframe API specification
-        in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
-        If it is ``None``, it should return an object corresponding to
-        latest version of the dataframe API specification. If the given
-        version is invalid or not implemented for the given module, an
-        error should be raised. Default: ``None``.
+    keys : str | list[str], optional
+        Column names to consider when finding unique values.
+        If `None`, all columns are considered.
 
     Returns
     -------
-    Column
+    Expression
+        Indices corresponding to unique values.
+
+    Notes
+    -----
+    There are no ordering guarantees. In particular, if there are multiple
+    indices corresponding to the same unique value(s), there is no guarantee
+    about which one will appear in the result.
+    If the original column(s) contain multiple `'NaN'` values, then
+    only a single index corresponding to those values will be returned.
+    Likewise for null values (if ``skip_nulls=False``).
+    To get the unique values, you can do ``df.get_rows(unique_indices(keys))``.
     """
     ...
 
-def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any], api_version: str | None = None) -> DataFrame:
+
+def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any]) -> DataFrame:
     """
     Construct DataFrame from 2D array.
 
-    See `column_from_1d_array` for related 1D function.
-
     Only Array-API-compliant 2D arrays are supported.
     Cross-kind casting is undefined and may vary across implementations.
     Downcasting is disallowed.
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py
index 7383e1d2..8bf1fb86 100644
--- a/spec/API_specification/dataframe_api/dataframe_object.py
+++ b/spec/API_specification/dataframe_api/dataframe_object.py
@@ -4,7 +4,7 @@
 
 if TYPE_CHECKING:
-    from .column_object import Column
+    from .expression_object import Expression
     from .groupby_object import GroupBy
     from . import Bool
     from ._types import NullType, Scalar
@@ -90,36 +90,30 @@ def groupby(self, keys: str | list[str], /) -> GroupBy:
         """
         ...
 
-    def get_column_by_name(self, name: str, /) -> Column[Any]:
+    def select(self, names: Sequence[str | Expression], /) -> DataFrame:
         """
-        Select a column by name.
+        Select multiple columns, either by name or by expressions.
 
         Parameters
         ----------
-        name : str
+        names : Sequence[str | Expression]
 
         Returns
        -------
-        Column
+        DataFrame
 
-        Raises
-        ------
-        KeyError
-            If the key is not present.
-        """
-        ...
+        Examples
+        --------
+        Select columns 'a' and 'b':
 
-    def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame:
-        """
-        Select multiple columns by name.
+        >>> df: DataFrame
+        >>> df.select(['a', 'b'])
 
-        Parameters
-        ----------
-        names : Sequence[str]
+        You can also pass expressions:
 
-        Returns
-        -------
-        DataFrame
+        >>> df: DataFrame
+        >>> namespace = df.__dataframe_namespace__()
+        >>> df.select(['a', namespace.col('b')+1])
 
         Raises
         ------
@@ -128,13 +122,13 @@ def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame:
         """
         ...
 
-    def get_rows(self, indices: Column[Any]) -> DataFrame:
+    def get_rows(self, indices: Expression) -> DataFrame:
         """
         Select a subset of rows, similar to `ndarray.take`.
 
         Parameters
         ----------
-        indices : Column[int]
+        indices : Expression
             Positions of rows to select.
 
         Returns
         -------
@@ -161,70 +155,85 @@ def slice_rows(
         """
         ...
 
-    def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame:
+    def get_rows_by_mask(self, mask: Expression) -> DataFrame:
         """
         Select a subset of rows corresponding to a mask.
 
         Parameters
         ----------
-        mask : Column[bool]
+        mask : Expression
 
         Returns
         -------
         DataFrame
 
-        Notes
-        -----
-        Some participants preferred a weaker type Arraylike[bool] for mask,
-        where 'Arraylike' denotes an object adhering to the Array API standard.
+ Examples + -------- + + Here is how you could keep rows in a dataframe where the values in + column 'a' are greater than 3: + + >>> df: DataFrame + >>> namespace = df.__dataframe_namespace__() + >>> mask = namespace.col('a') > 3 + >>> df = df.get_rows_by_mask(mask) """ ... - def insert_column(self, column: Column[Any]) -> DataFrame: + def insert_column(self, column: Expression) -> DataFrame: """ Insert column into DataFrame at rightmost location. The column's name will be used as the label in the resulting dataframe. - To insert the column with a different name, combine with `Column.rename`, + To insert the column with a different name, combine with `Expression.rename`, e.g.: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + df: DataFrame + namespace = df.__dataframe_namespace__() + col = namespace.col + new_column = namespace.col('a') + 1 df = df.insert_column(new_column.rename('a_plus_1')) If you need to insert the column at a different location, combine with - :meth:`get_columns_by_name`, e.g.: + :meth:`select`, e.g.: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + df: DataFrame + namespace = df.__dataframe_namespace__() + col = namespace.col + new_column = namespace.col('a') + 1 new_columns_names = ['a_plus_1'] + df.get_column_names() df = df.insert_column(new_column.rename('a_plus_1')) - df = df.get_columns_by_name(new_column_names) + df = df.select(new_column_names) Parameters ---------- - column : Column + expression : Expression """ ... - def update_columns(self, columns: Column[Any] | Sequence[Column[Any]], /) -> DataFrame: + def update_columns(self, columns: Expression | Sequence[Expression], /) -> DataFrame: """ Update values in existing column(s) from Dataframe. The column's name will be used to tell which column to update. - To update a column with a different name, combine with :meth:`Column.rename`, + To update a column with a different name, combine with :meth:`Expression.rename`, e.g.: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 - df = df.update_column(new_column.rename('b')) + df: DataFrame + namespace = df.__dataframe_namespace__() + col = namespace.col + new_column = namespace.col('a') + 1 + df = df.update_columns(new_column.rename('b')) Parameters ---------- - columns : Column | Sequence[Column] + columns : Expression | Sequence[Expression] Column(s) to update. If updating multiple columns, they must all have different names. @@ -289,7 +298,7 @@ def sort( Sort dataframe according to given columns. If you only need the indices which would sort the dataframe, use - :meth:`sorted_indices`. + :func:`dataframe_api.sorted_indices`. Parameters ---------- @@ -319,46 +328,6 @@ def sort( """ ... - def sorted_indices( - self, - keys: str | list[str] | None = None, - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[Any]: - """ - Return row numbers which would sort according to given columns. - - If you need to sort the DataFrame, use :meth:`sort`. - - Parameters - ---------- - keys : str | list[str], optional - Names of columns to sort by. - If `None`, sort by all columns. - ascending : Sequence[bool] or bool - If `True`, sort by all keys in ascending order. - If `False`, sort by all keys in descending order. - If a sequence, it must be the same length as `keys`, - and determines the direction with which to use each - key to sort by. 
- nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column[int] - - Raises - ------ - ValueError - If `keys` and `ascending` are sequences of different lengths. - """ - ... - def __eq__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -692,34 +661,6 @@ def all(self, *, skip_nulls: bool = True) -> DataFrame: """ ... - def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: - """ - Reduction returns a Column. - - Differs from ``DataFrame.any`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: - """ - Reduction returns a Column. - - Differs from ``DataFrame.all`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - def min(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. @@ -826,33 +767,6 @@ def is_nan(self) -> DataFrame: """ ... - def unique_indices(self, keys: str | list[str] | None = None, *, skip_nulls: bool = True) -> Column[int]: - """ - Return indices corresponding to unique values across selected columns. - - Parameters - ---------- - keys : str | list[str], optional - Column names to consider when finding unique values. - If `None`, all columns are considered. - - Returns - ------- - Column[int] - Indices corresponding to unique values. - - Notes - ----- - There are no ordering guarantees. In particular, if there are multiple - indices corresponding to the same unique value(s), there is no guarantee - about which one will appear in the result. - If the original column(s) contain multiple `'NaN'` values, then - only a single index corresponding to those values will be returned. - Likewise for null values (if ``skip_nulls=False``). - To get the unique values, you can do ``df.get_rows(df.unique_indices(keys))``. - """ - ... - def fill_nan(self, value: float | NullType, /) -> DataFrame: """ Fill ``nan`` values with the given fill value. diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/expression_object.py similarity index 56% rename from spec/API_specification/dataframe_api/column_object.py rename to spec/API_specification/dataframe_api/expression_object.py index c8eb666f..6fdf4f68 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/expression_object.py @@ -9,85 +9,76 @@ from ._types import NullType, Scalar -__all__ = ['Column'] +__all__ = ['Expression'] -class Column(Generic[DType]): +class Expression: """ - Column object + Expression object, which maps a DataFrame to a column. - Note that this column object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via + Not meant to be used directly - instead, use :func:`dataframe_api.col`. 
- """ + An expression is a function which maps a DataFrame to a column, and can be + used within the context of: - def __column_namespace__(self) -> Any: - """ - Returns an object that has all the Dataframe Standard API functions on it. + - :meth:`DataFrame.select` + - :meth:`DataFrame.insert_column` + - :meth:`DataFrame.update_columns` + - :meth:`DataFrame.get_rows_by_mask` - Returns - ------- - namespace: Any - An object representing the dataframe API namespace. It should have - every top-level function defined in the specification as an - attribute. It may contain other public names as well, but it is - recommended to only include those names that are part of the - specification. + Example: - """ - - @property - def column(self) -> Any: - """ - Return underlying (not-necessarily-Standard-compliant) column. + .. code-block::python - If a library only implements the Standard, then this can return `self`. - """ - ... + df: DataFrame + namespace = df.__dataframe_namespace__() + col = namespace.col + df = df.select(col(['a', 'b'])) - @property - def name(self) -> str: - """Return name of column.""" + resolves to (pandas syntax): - def __len__(self) -> int: - """ - Return the number of rows. - """ - - def __iter__(self) -> NoReturn: - """ - Iterate over elements. + .. code-block::python - This is intentionally "poisoned" to discourage inefficient code patterns. + df: pd.DataFrame + df = df.loc[:, ['a', 'b']] + + Multiple column calls can be chained together. For example: + + .. code-block::python + + df: DataFrame + namespace = df.__dataframe_namespace__() + col = namespace.col + new_column = ( + (col('petal_width') - col('petal_width').mean()) + .rename('petal_width_centered') + ) + df = df.insert_column(new_column) + + resolves to (pandas syntax) - Raises - ------ - NotImplementedError - """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") + .. code-block::python - @property - def dtype(self) -> Any: - """ - Return data type of column. - """ + df: pd.DataFrame + new_column = ( + (df['petal_width'] - df['petal_width'].mean()) + .rename('petal_width_centered') + ) + df[new_column.name] = new_column + """ - def get_rows(self: Column[DType], indices: Column[Any]) -> Column[DType]: + def __len__(self) -> Expression: """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column[int] - Positions of rows to select. + Return the number of rows. """ - ... + @property + def name(self) -> str: + """Return output name of expression.""" def slice_rows( - self: Column[DType], start: int | None, stop: int | None, step: int | None - ) -> Column[DType]: + self: Expression, start: int | None, stop: int | None, step: int | None + ) -> Expression: """ Select a subset of rows corresponding to a slice. @@ -99,32 +90,25 @@ def slice_rows( Returns ------- - Column + Expression """ ... - - def get_rows_by_mask(self: Column[DType], mask: Column[Bool]) -> Column[DType]: + def get_rows_by_mask(self, mask: Expression) -> Expression: """ Select a subset of rows corresponding to a mask. Parameters ---------- - mask : Column[bool] + mask : Expression Returns ------- - Column - - Notes - ----- - Some participants preferred a weaker type Arraylike[bool] for mask, - where 'Arraylike' denotes an object adhering to the Array API standard. + Expression """ ... - - def get_value(self, row_number: int) -> Scalar: + def get_value(self, row_number: int) -> Expression: """ Select the value at a row number, similar to `ndarray.__getitem__()`. 
@@ -135,9 +119,7 @@ def get_value(self, row_number: int) -> Scalar: Returns ------- - Scalar - Depends on the dtype of the Column, and may vary - across implementations. + Expression """ ... @@ -146,12 +128,12 @@ def sort( *, ascending: bool = True, nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[DType]: + ) -> Expression: """ - Sort column. + Sort expression. - If you need the indices which would sort the column, - use :meth:`sorted_indices`. + If you need the indices which would sort the expression, + use :func:`sorted_indices`. Parameters ---------- @@ -166,7 +148,7 @@ def sort( Returns ------- - Column + Expression """ ... @@ -175,11 +157,11 @@ def sorted_indices( *, ascending: bool = True, nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[Any]: + ) -> Expression: """ - Return row numbers which would sort column. + Return row numbers which would sort expression. - If you need to sort the Column, use :meth:`sort`. + If you need to sort the expression, use :meth:`sort`. Parameters ---------- @@ -194,11 +176,11 @@ def sorted_indices( Returns ------- - Column[int] + Expression """ ... - def __eq__(self, other: Column[Any] | Scalar) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Expression | Scalar) -> Expression: # type: ignore[override] """ Compare for equality. @@ -206,17 +188,17 @@ def __eq__(self, other: Column[Any] | Scalar) -> Column[Bool]: # type: ignore[o Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __ne__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] + def __ne__(self: Expression, other: Expression | Scalar) -> Expression: # type: ignore[override] """ Compare for non-equality. @@ -224,94 +206,94 @@ def __ne__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __ge__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: + def __ge__(self: Expression, other: Expression | Scalar) -> Expression: """ Compare for "greater than or equal to" `other`. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __gt__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: + def __gt__(self: Expression, other: Expression | Scalar) -> Expression: """ Compare for "greater than" `other`. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. 
Returns ------- - Column + Expression """ - def __le__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: + def __le__(self: Expression, other: Expression | Scalar) -> Expression: """ Compare for "less than or equal to" `other`. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __lt__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: + def __lt__(self: Expression, other: Expression | Scalar) -> Expression: """ Compare for "less than" `other`. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: + def __and__(self: Expression, other: Expression | bool) -> Expression: """ - Apply logical 'and' to `other` Column (or scalar) and this Column. + Apply logical 'and' to `other` expression (or scalar) and this expression. Nulls should follow Kleene Logic. Parameters ---------- - other : Column[bool] or bool - If Column, must have same length. + other : Expression[bool] or bool + If expression, must have same length. Returns ------- - Column + Expression Raises ------ @@ -319,20 +301,20 @@ def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: + def __or__(self: Expression, other: Expression | bool) -> Expression: """ - Apply logical 'or' to `other` Column (or scalar) and this column. + Apply logical 'or' to `other` expression (or scalar) and this expression. Nulls should follow Kleene Logic. Parameters ---------- - other : Column[bool] or Scalar - If Column, must have same length. + other : Expression[bool] or Scalar + If expression, must have same length. Returns ------- - Column[bool] + Expression[bool] Raises ------ @@ -340,89 +322,89 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self: Column[Any], other: Column[Any] | Scalar) -> Column[Any]: + def __add__(self: Expression, other: Expression | Scalar) -> Expression: """ - Add `other` column or scalar to this column. + Add `other` expression or scalar to this expression. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __sub__(self: Column[Any], other: Column[Any] | Scalar) -> Column[Any]: + def __sub__(self: Expression, other: Expression | Scalar) -> Expression: """ - Subtract `other` column or scalar from this column. + Subtract `other` expression or scalar from this expression. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. 
"Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __mul__(self, other: Expression | Scalar) -> Expression: """ - Multiply `other` column or scalar with this column. + Multiply `other` expression or scalar with this expression. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __truediv__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __truediv__(self, other: Expression | Scalar) -> Expression: """ - Divide this column by `other` column or scalar. True division, returns floats. + Divide this expression by `other` expression or scalar. True division, returns floats. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __floordiv__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __floordiv__(self, other: Expression | Scalar) -> Expression: """ - Floor-divide `other` column or scalar to this column. + Floor-divide `other` expression or scalar to this expression. Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __pow__(self, other: Expression | Scalar) -> Expression: """ - Raise this column to the power of `other`. + Raise this expression to the power of `other`. Integer dtype to the power of non-negative integer dtype is integer dtype. Integer dtype to the power of float dtype is float dtype. @@ -430,104 +412,104 @@ def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __mod__(self, other: Expression | Scalar) -> Expression: """ - Returns modulus of this column by `other` (`%` operator). + Returns modulus of this expression by `other` (`%` operator). Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + Expression """ - def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[Any], Column[Any]]: + def __divmod__(self, other: Expression | Scalar) -> tuple[Expression, Expression]: """ Return quotient and remainder of integer division. See `divmod` builtin function. 
Parameters ---------- - other : Column or Scalar - If Column, must have same length. + other : Expression or Scalar + If expression, must have same length. "Scalar" here is defined implicitly by what scalar types are allowed for the operation by the underling dtypes. Returns ------- - Column + tuple[Expression, Expression] """ - def __invert__(self: Column[Bool]) -> Column[Bool]: + def __invert__(self: Expression) -> Expression: """ Invert truthiness of (boolean) elements. Raises ------ ValueError - If any of the Column's columns is not boolean. + If any of the expression's expressions is not boolean. """ - def any(self: Column[Bool], *, skip_nulls: bool = True) -> bool | NullType: + def any(self: Expression, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a bool. Raises ------ ValueError - If column is not boolean. + If expression is not boolean. """ - def all(self: Column[Bool], *, skip_nulls: bool = True) -> bool | NullType: + def all(self: Expression, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a bool. Raises ------ ValueError - If column is not boolean. + If expression is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def min(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. + must be supported. The returned value has the same dtype as the expression. """ - def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def max(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. + must be supported. The returned value has the same dtype as the expression. """ - def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def sum(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the - column. + expression. """ - def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def prod(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical data types. - The returned value has the same dtype as the column. + The returned value has the same dtype as the expression. """ - def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def median(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -535,7 +517,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: dtypes. """ - def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def mean(self, *, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -543,7 +525,7 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: dtypes. """ - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. 
Returns a float for numerical data types, and @@ -559,17 +541,17 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar where ``N`` corresponds to the total number of elements over which the standard deviation is computed. When computing the standard deviation of a population, setting this parameter to ``0`` is the - standard choice (i.e., the provided column contains data + standard choice (i.e., the provided expression contains data constituting an entire population). When computing the corrected sample standard deviation, setting this parameter to ``1`` is the - standard choice (i.e., the provided column contains data sampled + standard choice (i.e., the provided expression contains data sampled from a larger population; this is commonly referred to as Bessel's correction). Fractional (float) values are allowed. Default: ``1``. skip_nulls Whether to skip null values. """ - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar | NullType: + def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Expression: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -581,44 +563,44 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar correction Correction to apply to the result. For example, ``0`` for sample standard deviation and ``1`` for population standard deviation. - See `Column.std` for a more detailed description. + See `expression.std` for a more detailed description. skip_nulls Whether to skip null values. """ - def cumulative_max(self: Column[DType]) -> Column[DType]: + def cumulative_max(self: Expression) -> Expression: """ - Reduction returns a Column. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. + Reduction returns a expression. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the expression. """ - def cumulative_min(self: Column[DType]) -> Column[DType]: + def cumulative_min(self: Expression) -> Expression: """ - Reduction returns a Column. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. + Reduction returns a expression. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the expression. """ - def cumulative_sum(self: Column[DType]) -> Column[DType]: + def cumulative_sum(self: Expression) -> Expression: """ - Reduction returns a Column. Must be supported for numerical and + Reduction returns a expression. Must be supported for numerical and datetime data types. The returned value has the same dtype as the - column. + expression. """ - def cumulative_prod(self: Column[DType]) -> Column[DType]: + def cumulative_prod(self: Expression) -> Expression: """ - Reduction returns a Column. Must be supported for numerical and + Reduction returns a expression. Must be supported for numerical and datetime data types. The returned value has the same dtype as the - column. + expression. """ - def is_null(self) -> Column[Bool]: + def is_null(self) -> Expression: """ Check for 'missing' or 'null' entries. Returns ------- - Column + Expression See also -------- @@ -631,13 +613,13 @@ def is_null(self) -> Column[Bool]: but note that the Standard makes no guarantees about them. """ - def is_nan(self) -> Column[Bool]: + def is_nan(self) -> Expression: """ Check for nan entries. 
Returns ------- - Column + Expression See also -------- @@ -650,31 +632,31 @@ def is_nan(self) -> Column[Bool]: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self: Column[DType], values: Column[DType]) -> Column[Bool]: + def is_in(self: Expression, values: Expression) -> Expression: """ Indicate whether the value at each row matches any value in `values`. Parameters ---------- - values : Column + values : Expression Contains values to compare against. May include ``float('nan')`` and ``null``, in which case ``'nan'`` and ``null`` will respectively return ``True`` even though ``float('nan') == float('nan')`` isn't ``True``. - The dtype of ``values`` must match the current column's dtype. + The dtype of ``values`` must match the current expression's dtype. Returns ------- - Column[bool] + Expression[bool] """ - def unique_indices(self, *, skip_nulls: bool = True) -> Column[Any]: + def unique_indices(self, *, skip_nulls: bool = True) -> Expression: """ - Return indices corresponding to unique values in Column. + Return indices corresponding to unique values in expression. Returns ------- - Column[int] + Expression[int] Indices corresponding to unique values. Notes @@ -682,87 +664,52 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[Any]: There are no ordering guarantees. In particular, if there are multiple indices corresponding to the same unique value, there is no guarantee about which one will appear in the result. - If the original Column contains multiple `'NaN'` values, then + If the original expression contains multiple `'NaN'` values, then only a single index corresponding to those values will be returned. Likewise for null values (if ``skip_nulls=False``). To get the unique values, you can do ``col.get_rows(col.unique_indices())``. """ ... - def fill_nan(self: Column[DType], value: float | NullType, /) -> Column[DType]: + def fill_nan(self: Expression, value: float | NullType, /) -> Expression: """ Fill floating point ``nan`` values with the given fill value. Parameters ---------- value : float or `null` - Value used to replace any ``nan`` in the column with. Must be - of the Python scalar type matching the dtype of the column (or + Value used to replace any ``nan`` in the expression with. Must be + of the Python scalar type matching the dtype of the expression (or be `null`). """ ... - def fill_null(self: Column[DType], value: Scalar, /) -> Column[DType]: + def fill_null(self: Expression, value: Scalar, /) -> Expression: """ Fill null values with the given fill value. Parameters ---------- value : Scalar - Value used to replace any ``null`` values in the column with. - Must be of the Python scalar type matching the dtype of the column. + Value used to replace any ``null`` values in the expression with. + Must be of the Python scalar type matching the dtype of the expression. """ ... - def to_array_object(self, dtype: Any) -> Any: - """ - Convert to array-API-compliant object. - - Parameters - ---------- - dtype : DType - The dtype of the array-API-compliant object to return. - Must be one of: - - - Bool() - - Int8() - - Int16() - - Int32() - - Int64() - - UInt8() - - UInt16() - - UInt32() - - UInt64() - - Float32() - - Float64() - - Returns - ------- - Any - An array-API-compliant object. 
-
-        Notes
-        -----
-        While numpy arrays are not yet array-API-compliant, implementations
-        may choose to return a numpy array (for numpy prior to 2.0), with the
-        understanding that consuming libraries would then use the
-        ``array-api-compat`` package to convert it to a Standard-compliant array.
-        """
-
-    def rename(self, name: str) -> Column[DType]:
+    def rename(self, name: str) -> Expression:
         """
-        Rename column.
+        Rename expression.
 
         Parameters
         ----------
         name : str
-            New name for column.
+            New name for expression.
 
         Returns
         -------
-        Column
-            New column - this does not operate in-place.
+        Expression
+            New expression - this does not operate in-place.
         """
         ...
diff --git a/spec/API_specification/column_object.rst b/spec/API_specification/expression_object.rst
similarity index 50%
rename from spec/API_specification/column_object.rst
rename to spec/API_specification/expression_object.rst
index 3201b500..83e53a2e 100644
--- a/spec/API_specification/column_object.rst
+++ b/spec/API_specification/expression_object.rst
@@ -1,12 +1,12 @@
 .. _column-object:
 
-Column object
-=============
+Expression object
+=================
 
 A conforming implementation of the dataframe API standard must provide and
-support a column object having the following methods, attributes, and
+support an expression object having the following methods, attributes, and
 behavior.
 
 .. currentmodule:: dataframe_api
 
-.. autoclass:: Column
+.. autoclass:: Expression
diff --git a/spec/API_specification/index.rst b/spec/API_specification/index.rst
index 1809c87a..779ed854 100644
--- a/spec/API_specification/index.rst
+++ b/spec/API_specification/index.rst
@@ -16,6 +16,11 @@ of objects and functions in the top-level namespace. The latter are:
   __dataframe_api_version__
   is_null
   null
+  col
+  sorted_indices
+  unique_indices
+  any_rowwise
+  all_rowwise
   Int64
   Int32
   Int16
@@ -28,17 +33,14 @@ of objects and functions in the top-level namespace. The latter are:
   Float32
   Bool
   is_dtype
-  column_from_sequence
-  column_from_1d_array
-  dataframe_from_dict
   dataframe_from_2d_array
 
-The ``DataFrame``, ``Column`` and ``GroupBy`` objects have the following
+The ``DataFrame``, ``Expression`` and ``GroupBy`` objects have the following
 methods and attributes:
 
 .. toctree::
    :maxdepth: 3
 
    dataframe_object
-   column_object
+   expression_object
    groupby_object
diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md
index 199d1a74..45e6e759 100644
--- a/spec/purpose_and_scope.md
+++ b/spec/purpose_and_scope.md
@@ -285,16 +285,19 @@ df_polars = pl.scan_parquet('iris.parquet')
 
 def my_dataframe_agnostic_function(df):
     df = df.__dataframe_consortium_standard__(api_version='2023.08-beta')
+    namespace = df.__dataframe_namespace__()
 
-    mask = df.get_column_by_name('species') != 'setosa'
+    mask = namespace.col('species') != 'setosa'
    df = df.get_rows_by_mask(mask)
 
+    new_columns = []
     for column_name in df.get_column_names():
         if column_name == 'species':
             continue
-        new_column = df.get_column_by_name(column_name)
+        new_column = namespace.col(column_name)
         new_column = (new_column - new_column.mean()) / new_column.std()
-        df = df.insert(loc=len(df.get_column_names()), label=f'{column_name}_scaled', value=new_column)
+        new_columns.append(new_column)
+    df = df.update_columns(new_columns)
     return df.dataframe
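For comparison, the dataframe-agnostic function above resolves to roughly the following in plain pandas, in the spirit of the "resolves to (pandas syntax)" examples in the Expression docstring. This is an illustrative sketch only; the function name ``scale_features`` and the in-place column assignment are assumptions, not part of the Standard:

.. code-block:: python

    import pandas as pd

    def scale_features(df: pd.DataFrame) -> pd.DataFrame:
        # Keep rows where 'species' differs from 'setosa'.
        df = df.loc[df['species'] != 'setosa'].copy()

        # Standardise every remaining column except 'species', keeping the
        # original column names (this mirrors the update_columns call above).
        for column_name in df.columns:
            if column_name == 'species':
                continue
            column = df[column_name]
            df[column_name] = (column - column.mean()) / column.std()
        return df

Under these assumptions, ``scale_features(pd.read_parquet('iris.parquet'))`` should give the same result as passing the pandas dataframe through ``my_dataframe_agnostic_function``.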