From 751a131ea8d855d6ff74a6555cf9cc974c7a26cf Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 31 Oct 2023 16:25:07 +0000 Subject: [PATCH 01/26] note what may raise --- spec/design_topics/python_builtin_types.md | 34 ++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index c85812eb..5144589c 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -30,8 +30,8 @@ builtin types to CPU. In the above example, the `.mean()` call returns a `float`. It is likely beneficial though to implement this as a library-specific scalar object which duck types with `float`. This means that it should (a) have the same semantics as a builtin `float` when used within a library, and (b) -support usage as a `float` outside of the library (i.e., implement -`__float__`). Duck typing is usually not perfect, for example `isinstance` +support usage as a `float` outside of the library (see below). +Duck typing is usually not perfect, for example `isinstance` usage on the float-like duck type will behave differently. Such explicit "type of object" checks don't have to be supported. @@ -39,3 +39,33 @@ The following design rule applies everywhere builtin Python types are used within this API standard: _where a Python builtin type is specified, an implementation may always replace it by an equivalent library-specific type that duck types with the Python builtin type._ + +## Required methods + +A ducktyped float scalar is required to implement all the methods which `float` implements, +but note that those which require returning a Python scalar may raise +(depending on the implementation). 
+ +For example, if a library implements `FancyFloat` and `FancyBool` scalars, +then the following should all be supported: +```python +df: DataFrame +column_1: Column = df.col('a') +column_2: Column = df.col('b') + +scalar: FancyFloat = column_1.std() +result_1: Column = column_2 - column_1.std() +result_2: FancyBool = column_2.std() > column_1.std() +``` +The following, however, may raise, dependening on the +implementation: +```python +df: DataFrame +column = df.col('a') + +if column.std() > 0: # this line may raise! + print('std is positive') +``` +This is because `if column.std() > 0` will call `(column.std() > 0).__bool__()`, +which must produce a Python scalar. Therefore, a purely lazy dataframe library +may choose to raise here. From fc65648b0e9f41ffdd1f0b4f47b6ff632f6c1f4f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 12:27:14 +0000 Subject: [PATCH 02/26] list required methods --- spec/design_topics/python_builtin_types.md | 36 +++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 5144589c..430d3696 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -42,9 +42,36 @@ that duck types with the Python builtin type._ ## Required methods -A ducktyped float scalar is required to implement all the methods which `float` implements, -but note that those which require returning a Python scalar may raise -(depending on the implementation). 
+If a library doesn't use the Python built-in scalars, then its scalars must implement +at least the following operations which return scalars: +- `__lt__` +- `__le__` +- `__eq__` +- `__ne__` +- `__gt__` +- `__ge__` +- `__add__` +- `__radd__` +- `__sub__` +- `__rsub__` +- `__mul__` +- `__rmul__` +- `__mod__` +- `__rmod__` +- `__pow__` +- `__rpow__` +- `__floordiv__` +- `__rfloordiv__` +- `__truediv__` +- `__rtruediv__` +- `__neg__` +- `__abs__` + +Furthermore, unless the library exclusively allows for lazy execution, +it must also implement the following unary operations which return Python scalars: +- `__int__` +- `__float__` +- `__bool__` For example, if a library implements `FancyFloat` and `FancyBool` scalars, then the following should all be supported: @@ -68,4 +95,5 @@ if column.std() > 0: # this line may raise! ``` This is because `if column.std() > 0` will call `(column.std() > 0).__bool__()`, which must produce a Python scalar. Therefore, a purely lazy dataframe library -may choose to raise here. +may choose to raise here, whereas as one which allows for eager execution may return +a Python bool. 
From 7c24afdbaf542520c19b8bcb61878ec9562c5725 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 16:38:18 +0000 Subject: [PATCH 03/26] add scalar class --- .../dataframe_api/__init__.py | 4 +- .../dataframe_api/column_object.py | 6 +- .../dataframe_api/dataframe_object.py | 9 +- .../dataframe_api/scalar_object.py | 88 +++++++++++++++++++ .../API_specification/dataframe_api/typing.py | 37 +++----- 5 files changed, 113 insertions(+), 31 deletions(-) create mode 100644 spec/API_specification/dataframe_api/scalar_object.py diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 3a20f54b..ebad74c6 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -4,12 +4,12 @@ """ from __future__ import annotations -from typing import Dict, Sequence, Any, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Sequence from .column_object import * from .dataframe_object import DataFrame -from .groupby_object import * from .dtypes import * +from .groupby_object import * if TYPE_CHECKING: from .typing import DType, Scalar diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 915f016c..1a2376aa 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -1,11 +1,13 @@ from __future__ import annotations -from typing import Any,NoReturn, TYPE_CHECKING, Literal, Protocol +from typing import TYPE_CHECKING, Any, Literal, NoReturn, Protocol if TYPE_CHECKING: - from .typing import NullType, Scalar, DType, Namespace from typing_extensions import Self + from .scalar_object import Scalar + from .typing import DType, Namespace, NullType + __all__ = ['Column'] diff --git a/spec/API_specification/dataframe_api/dataframe_object.py 
b/spec/API_specification/dataframe_api/dataframe_object.py index f5c17d80..3691745b 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -1,13 +1,14 @@ from __future__ import annotations -from typing import Any, Literal, Mapping, Sequence, TYPE_CHECKING, NoReturn, Protocol - +from typing import TYPE_CHECKING, Any, Literal, Mapping, NoReturn, Protocol, Sequence if TYPE_CHECKING: + from typing_extensions import Self + from .column_object import Column from .groupby_object import GroupBy - from .typing import NullType, Scalar, Namespace, DType, SupportsDataFrameAPI - from typing_extensions import Self + from .scalar_object import Scalar + from .typing import DType, Namespace, NullType, SupportsDataFrameAPI __all__ = ["DataFrame"] diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py new file mode 100644 index 00000000..53ff5d00 --- /dev/null +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from typing import Any, Protocol + +__all__ = ['Scalar'] + + +class Scalar(Protocol): + """ + Scalar object + + Not meant to be instantiated directly, but rather created via + `:meth:Column.get_value` or one of the column reductions such + as `:meth:`Column.sum`. + """ + def __lt__(self, other: Any) -> Scalar: + ... + + def __le__(self, other: Any) -> Scalar: + ... + + def __eq__(self, other: Any) -> Scalar: # type: ignore[override] + ... + + def __ne__(self, other: Any) -> Scalar: # type: ignore[override] + ... + + def __gt__(self, other: Any) -> Scalar: + ... + + def __ge__(self, other: Any) -> Scalar: + ... + + def __add__(self, other: Any) -> Scalar: + ... + + def __radd__(self, other: Any) -> Scalar: + ... + + def __sub__(self, other: Any) -> Scalar: + ... + + def __rsub__(self, other: Any) -> Scalar: + ... + + def __mul__(self, other: Any) -> Scalar: + ... 
+ + def __rmul__(self, other: Any) -> Scalar: + ... + + def __mod__(self, other: Any) -> Scalar: + ... + + def __rmod__(self, other: Any) -> Scalar: + ... + + def __pow__(self, other: Any) -> Scalar: + ... + + def __rpow__(self, other: Any) -> Scalar: + ... + + def __floordiv__(self, other: Any) -> Scalar: + ... + + def __rfloordiv__(self, other: Any) -> Scalar: + ... + + def __truediv__(self, other: Any) -> Scalar: + ... + + def __rtruediv__(self, other: Any) -> Scalar: + ... + + def __neg__(self) -> Scalar: + ... + + def __abs__(self) -> Scalar: + ... + + def __bool__(self) -> bool: + """ + Note that this returns a Python scalar. + + Depending on the implementation, this may raise or trigger computation. + """ + ... diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 901c0f50..0c1390d0 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -3,38 +3,32 @@ """ from __future__ import annotations -from typing import ( - TYPE_CHECKING, - Any, - Literal, - Dict, - Protocol, - Sequence, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, Literal, Protocol, Sequence, Union from dataframe_api.column_object import Column from dataframe_api.dataframe_object import DataFrame -from dataframe_api.groupby_object import GroupBy, Aggregation as AggregationT +from dataframe_api.groupby_object import Aggregation as AggregationT +from dataframe_api.groupby_object import GroupBy if TYPE_CHECKING: from .dtypes import ( Bool, - Float64, - Float32, - Int64, - Int32, - Int16, - Int8, - UInt64, - UInt32, - UInt16, - UInt8, Date, Datetime, Duration, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, String, + UInt8, + UInt16, + UInt32, + UInt64, ) + from .scalar_object import Scalar DType = Union[ Bool, @@ -54,9 +48,6 @@ Duration, ] -# Type alias: Mypy needs Any, but for readability we need to make clear this -# is a Python scalar (i.e., an instance of 
`bool`, `int`, `float`, `str`, etc.) -Scalar = Any # null is a special object which represents a missing value. # It is not valid as a type. NullType = Any From 2714c13f51220c6a185356cdd2fdd44e61726f27 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 16:52:16 +0000 Subject: [PATCH 04/26] reword --- spec/design_topics/python_builtin_types.md | 60 +++++----------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 430d3696..0146e144 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: bool = True) -> float | NullType: + def mean(self, skip_nulls: bool = True) -> Scalar | NullType: ... larger = df2 > df1.col('foo').mean() @@ -27,51 +27,11 @@ larger = df2 > df1.col('foo').mean() For a GPU dataframe library, it is desirable for all data to reside on the GPU, and not incur a performance penalty from synchronizing instances of Python builtin types to CPU. In the above example, the `.mean()` call returns a -`float`. It is likely beneficial though to implement this as a library-specific -scalar object which duck types with `float`. This means that it should (a) have -the same semantics as a builtin `float` when used within a library, and (b) -support usage as a `float` outside of the library (see below). -Duck typing is usually not perfect, for example `isinstance` -usage on the float-like duck type will behave differently. Such explicit "type -of object" checks don't have to be supported. +`Scalar`. It is likely beneficial though to implement this as a library-specific +scalar object which (partially) duck types with `float`. The required methods it +must implement are listed in `:class:Scalar`. 
-The following design rule applies everywhere builtin Python types are used -within this API standard: _where a Python builtin type is specified, an -implementation may always replace it by an equivalent library-specific type -that duck types with the Python builtin type._ - -## Required methods - -If a library doesn't use the Python built-in scalars, then its scalars must implement -at least the following operations which return scalars: -- `__lt__` -- `__le__` -- `__eq__` -- `__ne__` -- `__gt__` -- `__ge__` -- `__add__` -- `__radd__` -- `__sub__` -- `__rsub__` -- `__mul__` -- `__rmul__` -- `__mod__` -- `__rmod__` -- `__pow__` -- `__rpow__` -- `__floordiv__` -- `__rfloordiv__` -- `__truediv__` -- `__rtruediv__` -- `__neg__` -- `__abs__` - -Furthermore, unless the library exclusively allows for lazy execution, -it must also implement the following unary operations which return Python scalars: -- `__int__` -- `__float__` -- `__bool__` +### Example For example, if a library implements `FancyFloat` and `FancyBool` scalars, then the following should all be supported: @@ -84,6 +44,10 @@ scalar: FancyFloat = column_1.std() result_1: Column = column_2 - column_1.std() result_2: FancyBool = column_2.std() > column_1.std() ``` + +Note that the scalars above are library-specific ones - they may be used to keep +data on GPU, or to keep data lazy. + The following, however, may raise, dependening on the implementation: ```python @@ -94,6 +58,6 @@ if column.std() > 0: # this line may raise! print('std is positive') ``` This is because `if column.std() > 0` will call `(column.std() > 0).__bool__()`, -which must produce a Python scalar. Therefore, a purely lazy dataframe library -may choose to raise here, whereas as one which allows for eager execution may return -a Python bool. +which is required by Python to produce a Python scalar. +Therefore, a purely lazy dataframe library may choose to raise here, whereas +one which allows for eager execution may return a Python bool. 
From 99b91a55ca3f1893a30533ed8944e83b46ace379 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 16:56:10 +0000 Subject: [PATCH 05/26] fixup --- spec/design_topics/python_builtin_types.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 0146e144..2339451a 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -31,7 +31,7 @@ builtin types to CPU. In the above example, the `.mean()` call returns a scalar object which (partially) duck types with `float`. The required methods it must implement are listed in `:class:Scalar`. -### Example +## Example For example, if a library implements `FancyFloat` and `FancyBool` scalars, then the following should all be supported: From a867f00f7eaaaadcf2424db9f7682d997142ec4d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 16:58:19 +0000 Subject: [PATCH 06/26] fixup --- spec/design_topics/python_builtin_types.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 2339451a..0b2ed617 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -29,7 +29,7 @@ and not incur a performance penalty from synchronizing instances of Python builtin types to CPU. In the above example, the `.mean()` call returns a `Scalar`. It is likely beneficial though to implement this as a library-specific scalar object which (partially) duck types with `float`. The required methods it -must implement are listed in `:class:Scalar`. +must implement are listed in the spec for class `Scalar`. 
## Example From f197672db4abb9c1a55976f4fd5f670e18b2eb7a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 8 Nov 2023 18:17:19 +0000 Subject: [PATCH 07/26] fixup --- spec/API_specification/dataframe_api/column_object.py | 2 -- spec/API_specification/dataframe_api/scalar_object.py | 2 +- spec/API_specification/dataframe_api/typing.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index ca00dadc..e02ccc02 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -8,8 +8,6 @@ from .scalar_object import Scalar from .typing import DType, Namespace, NullType - from .typing import DType, Namespace, NullType, Scalar - __all__ = ["Column"] diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index 0f5f9fb5..a15db639 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -6,7 +6,7 @@ class Scalar(Protocol): - """Scalar object + """Scalar object. 
Not meant to be instantiated directly, but rather created via `:meth:Column.get_value` or one of the column reductions such diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 13ac5181..e7ee8e56 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -14,6 +14,8 @@ from dataframe_api.groupby_object import Aggregation as AggregationT from dataframe_api.groupby_object import GroupBy +from .scalar_object import Scalar + if TYPE_CHECKING: from collections.abc import Sequence @@ -34,7 +36,6 @@ UInt32, UInt64, ) - from .scalar_object import Scalar DType = Union[ Bool, @@ -181,5 +182,4 @@ def __column_consortium_standard__( "Scalar", "SupportsColumnAPI", "SupportsDataFrameAPI", - "Scalar", ] From 409d8f3cf401479a28234bc0abfbdcb1a1e0245b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 Nov 2023 19:09:43 +0000 Subject: [PATCH 08/26] replace Scalar|NullType with Scalar --- .../dataframe_api/column_object.py | 18 +++++++++--------- spec/design_topics/python_builtin_types.md | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index e02ccc02..33f6a560 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -518,7 +518,7 @@ def all(self, *, skip_nulls: bool = True) -> bool | NullType: """ ... - def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def min(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -526,7 +526,7 @@ def min(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... 
- def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def max(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -534,7 +534,7 @@ def max(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def sum(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -543,7 +543,7 @@ def sum(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def prod(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical data types. @@ -551,7 +551,7 @@ def prod(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def median(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -561,7 +561,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar | NullType: """ ... - def mean(self, *, skip_nulls: bool = True) -> Scalar | NullType: + def mean(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -576,7 +576,7 @@ def std( *, correction: int | float = 1, skip_nulls: bool = True, - ) -> Scalar | NullType: + ) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -609,7 +609,7 @@ def var( *, correction: int | float = 1, skip_nulls: bool = True, - ) -> Scalar | NullType: + ) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -738,7 +738,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Self: """ ... - def fill_nan(self, value: float | NullType, /) -> Self: + def fill_nan(self, value: Scalar, /) -> Self: """Fill floating point ``nan`` values with the given fill value. 
Parameters diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 0b2ed617..11411267 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: bool = True) -> Scalar | NullType: + def mean(self, skip_nulls: bool = True) -> Scalar: ... larger = df2 > df1.col('foo').mean() From 0db987154f9aa52287b886b4ff693e6a2d784bd7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 Nov 2023 19:14:24 +0000 Subject: [PATCH 09/26] type null as Scalar --- spec/API_specification/dataframe_api/column_object.py | 6 +++--- spec/API_specification/dataframe_api/dataframe_object.py | 4 ++-- spec/API_specification/dataframe_api/typing.py | 7 +------ spec/conf.py | 1 - 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 33f6a560..ec921b2d 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -6,7 +6,7 @@ from typing_extensions import Self from .scalar_object import Scalar - from .typing import DType, Namespace, NullType + from .typing import DType, Namespace __all__ = ["Column"] @@ -498,7 +498,7 @@ def __invert__(self) -> Self: """ ... - def any(self, *, skip_nulls: bool = True) -> bool | NullType: + def any(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a bool. Raises @@ -508,7 +508,7 @@ def any(self, *, skip_nulls: bool = True) -> bool | NullType: """ ... - def all(self, *, skip_nulls: bool = True) -> bool | NullType: + def all(self, *, skip_nulls: bool = True) -> Scalar: """Reduction returns a bool. 
Raises diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index ddba939a..5454893e 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -9,7 +9,7 @@ from .column_object import Column from .groupby_object import GroupBy - from .typing import DType, Namespace, NullType, Scalar, SupportsDataFrameAPI + from .typing import DType, Namespace, Scalar, SupportsDataFrameAPI __all__ = ["DataFrame"] @@ -803,7 +803,7 @@ def unique_indices(self, *keys: str, skip_nulls: bool = True) -> Column: """ ... - def fill_nan(self, value: float | NullType, /) -> Self: + def fill_nan(self, value: Scalar, /) -> Self: """Fill ``nan`` values with the given fill value. The fill operation will apply to all columns with a floating-point diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 6cccf39d..ca5f12b6 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -63,10 +63,7 @@ class Bool: class Date: ... - class NullType: - ... - - null: NullType + null: Scalar class Datetime: def __init__( # noqa: ANN204 @@ -130,7 +127,6 @@ def date(self, year: int, month: int, day: int) -> Scalar: ... 
-NullType = Namespace.NullType DType = Union[ Namespace.Bool, Namespace.Float64, @@ -174,7 +170,6 @@ def __column_consortium_standard__( "DType", "GroupBy", "Namespace", - "NullType", "Scalar", "SupportsColumnAPI", "SupportsDataFrameAPI", diff --git a/spec/conf.py b/spec/conf.py index cc6e3270..054c012d 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -85,7 +85,6 @@ ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), - ('py:class', 'NullType'), ('py:class', 'Namespace'), ('py:class', 'SupportsDataFrameAPI'), ('py:class', 'Self'), From 6520ac438f222408d24f312fbc83705a5eb9cca5 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 14 Nov 2023 19:21:19 +0000 Subject: [PATCH 10/26] add example of working with scalars --- .../examples/05_scalars_example.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spec/API_specification/examples/05_scalars_example.py diff --git a/spec/API_specification/examples/05_scalars_example.py b/spec/API_specification/examples/05_scalars_example.py new file mode 100644 index 00000000..b6b5f41b --- /dev/null +++ b/spec/API_specification/examples/05_scalars_example.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from dataframe_api.typing import SupportsDataFrameAPI + + +def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI: + df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta") + + # We can fill nulls using a Scalar object. + df = df.fill_null(df.col("a").mean()) + + # Python Scalars also implement the Scalar Protocol (indeed, the Scalar + # Protocol is designed to be a subset of the Python Scalar types), so we + # can pass Python scalars too. 
+ df = df.fill_null(3) + return df.dataframe From 46bc08c0d2427e69c9d2c1114822c939ec7505c8 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:44:56 +0000 Subject: [PATCH 11/26] use AnyScalar; --- .../dataframe_api/column_object.py | 58 +++++++++---------- .../dataframe_api/dataframe_object.py | 52 ++++++++--------- .../API_specification/dataframe_api/typing.py | 6 ++ .../examples/05_scalars_example.py | 8 +-- spec/conf.py | 1 + 5 files changed, 66 insertions(+), 59 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index ec921b2d..777c7593 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -6,7 +6,7 @@ from typing_extensions import Self from .scalar_object import Scalar - from .typing import DType, Namespace + from .typing import AnyScalar, DType, Namespace __all__ = ["Column"] @@ -187,7 +187,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Self | Scalar) -> Self: # type: ignore[override] + def __eq__(self, other: Self | AnyScalar) -> Self: # type: ignore[override] """Compare for equality. Nulls should follow Kleene Logic. @@ -205,7 +205,7 @@ def __eq__(self, other: Self | Scalar) -> Self: # type: ignore[override] """ ... - def __ne__(self, other: Self | Scalar) -> Self: # type: ignore[override] + def __ne__(self, other: Self | AnyScalar) -> Self: # type: ignore[override] """Compare for non-equality. Nulls should follow Kleene Logic. @@ -223,7 +223,7 @@ def __ne__(self, other: Self | Scalar) -> Self: # type: ignore[override] """ ... - def __ge__(self, other: Self | Scalar) -> Self: + def __ge__(self, other: Self | AnyScalar) -> Self: """Compare for "greater than or equal to" `other`. Parameters @@ -239,7 +239,7 @@ def __ge__(self, other: Self | Scalar) -> Self: """ ... 
- def __gt__(self, other: Self | Scalar) -> Self: + def __gt__(self, other: Self | AnyScalar) -> Self: """Compare for "greater than" `other`. Parameters @@ -255,7 +255,7 @@ def __gt__(self, other: Self | Scalar) -> Self: """ ... - def __le__(self, other: Self | Scalar) -> Self: + def __le__(self, other: Self | AnyScalar) -> Self: """Compare for "less than or equal to" `other`. Parameters @@ -271,7 +271,7 @@ def __le__(self, other: Self | Scalar) -> Self: """ ... - def __lt__(self, other: Self | Scalar) -> Self: + def __lt__(self, other: Self | AnyScalar) -> Self: """Compare for "less than" `other`. Parameters @@ -287,7 +287,7 @@ def __lt__(self, other: Self | Scalar) -> Self: """ ... - def __and__(self, other: Self | bool) -> Self: + def __and__(self, other: Self | AnyScalar) -> Self: """Apply logical 'and' to `other` Column (or scalar) and this Column. Nulls should follow Kleene Logic. @@ -308,7 +308,7 @@ def __and__(self, other: Self | bool) -> Self: """ ... - def __or__(self, other: Self | bool) -> Self: + def __or__(self, other: Self | AnyScalar) -> Self: """Apply logical 'or' to `other` Column (or scalar) and this column. Nulls should follow Kleene Logic. @@ -329,7 +329,7 @@ def __or__(self, other: Self | bool) -> Self: """ ... - def __add__(self, other: Self | Scalar) -> Self: + def __add__(self, other: Self | AnyScalar) -> Self: """Add `other` column or scalar to this column. Parameters @@ -345,7 +345,7 @@ def __add__(self, other: Self | Scalar) -> Self: """ ... - def __sub__(self, other: Self | Scalar) -> Self: + def __sub__(self, other: Self | AnyScalar) -> Self: """Subtract `other` column or scalar from this column. Parameters @@ -361,7 +361,7 @@ def __sub__(self, other: Self | Scalar) -> Self: """ ... - def __mul__(self, other: Self | Scalar) -> Self: + def __mul__(self, other: Self | AnyScalar) -> Self: """Multiply `other` column or scalar with this column. Parameters @@ -377,7 +377,7 @@ def __mul__(self, other: Self | Scalar) -> Self: """ ... 
- def __truediv__(self, other: Self | Scalar) -> Self: + def __truediv__(self, other: Self | AnyScalar) -> Self: """Divide this column by `other` column or scalar. True division, returns floats. Parameters @@ -393,7 +393,7 @@ def __truediv__(self, other: Self | Scalar) -> Self: """ ... - def __floordiv__(self, other: Self | Scalar) -> Self: + def __floordiv__(self, other: Self | AnyScalar) -> Self: """Floor-divide `other` column or scalar to this column. Parameters @@ -409,7 +409,7 @@ def __floordiv__(self, other: Self | Scalar) -> Self: """ ... - def __pow__(self, other: Self | Scalar) -> Self: + def __pow__(self, other: Self | AnyScalar) -> Self: """Raise this column to the power of `other`. Integer dtype to the power of non-negative integer dtype is integer dtype. @@ -429,7 +429,7 @@ def __pow__(self, other: Self | Scalar) -> Self: """ ... - def __mod__(self, other: Self | Scalar) -> Self: + def __mod__(self, other: Self | AnyScalar) -> Self: """Return modulus of this column by `other` (`%` operator). Parameters @@ -445,7 +445,7 @@ def __mod__(self, other: Self | Scalar) -> Self: """ ... - def __divmod__(self, other: Self | Scalar) -> tuple[Column, Column]: + def __divmod__(self, other: Self | AnyScalar) -> tuple[Column, Column]: """Return quotient and remainder of integer division. See `divmod` builtin. Parameters @@ -461,16 +461,16 @@ def __divmod__(self, other: Self | Scalar) -> tuple[Column, Column]: """ ... - def __radd__(self, other: Self | Scalar) -> Self: + def __radd__(self, other: Self | AnyScalar) -> Self: ... - def __rsub__(self, other: Self | Scalar) -> Self: + def __rsub__(self, other: Self | AnyScalar) -> Self: ... - def __rmul__(self, other: Self | Scalar) -> Self: + def __rmul__(self, other: Self | AnyScalar) -> Self: ... - def __rtruediv__(self, other: Self | Scalar) -> Self: + def __rtruediv__(self, other: Self | AnyScalar) -> Self: ... 
def __rand__(self, other: Self | bool) -> Self: @@ -479,13 +479,13 @@ def __rand__(self, other: Self | bool) -> Self: def __ror__(self, other: Self | bool) -> Self: ... - def __rfloordiv__(self, other: Self | Scalar) -> Self: + def __rfloordiv__(self, other: Self | AnyScalar) -> Self: ... - def __rpow__(self, other: Self | Scalar) -> Self: + def __rpow__(self, other: Self | AnyScalar) -> Self: ... - def __rmod__(self, other: Self | Scalar) -> Self: + def __rmod__(self, other: Self | AnyScalar) -> Self: ... def __invert__(self) -> Self: @@ -718,7 +718,7 @@ def is_in(self, values: Self) -> Self: """ ... - def unique_indices(self, *, skip_nulls: bool = True) -> Self: + def unique_indices(self, *, skip_nulls: AnyScalar = True) -> Self: """Return indices corresponding to unique values in Column. Returns @@ -738,7 +738,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Self: """ ... - def fill_nan(self, value: Scalar, /) -> Self: + def fill_nan(self, value: AnyScalar, /) -> Self: """Fill floating point ``nan`` values with the given fill value. Parameters @@ -751,7 +751,7 @@ def fill_nan(self, value: Scalar, /) -> Self: """ ... - def fill_null(self, value: Scalar, /) -> Self: + def fill_null(self, value: AnyScalar, /) -> Self: """Fill null values with the given fill value. Parameters @@ -797,7 +797,7 @@ def to_array(self) -> Any: """ ... - def rename(self, name: str) -> Self: + def rename(self, name: AnyScalar) -> Self: """Rename column. Parameters @@ -886,7 +886,7 @@ def iso_weekday(self) -> Self: """ ... - def unix_timestamp(self, *, time_unit: Literal["s", "ms", "us"] = "s") -> Self: + def unix_timestamp(self, *, time_unit: AnyScalar = "s") -> Self: """Return number of seconds / milliseconds / microseconds since the Unix epoch. The Unix epoch is 00:00:00 UTC on 1 January 1970. 
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 5454893e..58fd5439 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -9,7 +9,7 @@ from .column_object import Column from .groupby_object import GroupBy - from .typing import DType, Namespace, Scalar, SupportsDataFrameAPI + from .typing import AnyScalar, DType, Namespace, SupportsDataFrameAPI __all__ = ["DataFrame"] @@ -335,7 +335,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Scalar) -> Self: # type: ignore[override] + def __eq__(self, other: AnyScalar) -> Self: # type: ignore[override] """Compare for equality. Nulls should follow Kleene Logic. @@ -352,7 +352,7 @@ def __eq__(self, other: Scalar) -> Self: # type: ignore[override] """ ... - def __ne__(self, other: Scalar) -> Self: # type: ignore[override] + def __ne__(self, other: AnyScalar) -> Self: # type: ignore[override] """Compare for non-equality. Nulls should follow Kleene Logic. @@ -369,7 +369,7 @@ def __ne__(self, other: Scalar) -> Self: # type: ignore[override] """ ... - def __ge__(self, other: Scalar) -> Self: + def __ge__(self, other: AnyScalar) -> Self: """Compare for "greater than or equal to" `other`. Parameters @@ -384,7 +384,7 @@ def __ge__(self, other: Scalar) -> Self: """ ... - def __gt__(self, other: Scalar) -> Self: + def __gt__(self, other: AnyScalar) -> Self: """Compare for "greater than" `other`. Parameters @@ -399,7 +399,7 @@ def __gt__(self, other: Scalar) -> Self: """ ... - def __le__(self, other: Scalar) -> Self: + def __le__(self, other: AnyScalar) -> Self: """Compare for "less than or equal to" `other`. Parameters @@ -414,7 +414,7 @@ def __le__(self, other: Scalar) -> Self: """ ... - def __lt__(self, other: Scalar) -> Self: + def __lt__(self, other: AnyScalar) -> Self: """Compare for "less than" `other`. 
Parameters @@ -469,7 +469,7 @@ def __or__(self, other: bool) -> Self: # noqa: FBT001 """ ... - def __add__(self, other: Scalar) -> Self: + def __add__(self, other: AnyScalar) -> Self: """Add `other` scalar to this dataframe. Parameters @@ -484,7 +484,7 @@ def __add__(self, other: Scalar) -> Self: """ ... - def __sub__(self, other: Scalar) -> Self: + def __sub__(self, other: AnyScalar) -> Self: """Subtract `other` scalar from this dataframe. Parameters @@ -499,7 +499,7 @@ def __sub__(self, other: Scalar) -> Self: """ ... - def __mul__(self, other: Scalar) -> Self: + def __mul__(self, other: AnyScalar) -> Self: """Multiply `other` scalar with this dataframe. Parameters @@ -514,7 +514,7 @@ def __mul__(self, other: Scalar) -> Self: """ ... - def __truediv__(self, other: Scalar) -> Self: + def __truediv__(self, other: AnyScalar) -> Self: """Divide this dataframe by `other` scalar. True division, returns floats. Parameters @@ -529,7 +529,7 @@ def __truediv__(self, other: Scalar) -> Self: """ ... - def __floordiv__(self, other: Scalar) -> Self: + def __floordiv__(self, other: AnyScalar) -> Self: """Floor-divide (returns integers) this dataframe by `other` scalar. Parameters @@ -544,7 +544,7 @@ def __floordiv__(self, other: Scalar) -> Self: """ ... - def __pow__(self, other: Scalar) -> Self: + def __pow__(self, other: AnyScalar) -> Self: """Raise this dataframe to the power of `other`. Integer dtype to the power of non-negative integer dtype is integer dtype. @@ -563,7 +563,7 @@ def __pow__(self, other: Scalar) -> Self: """ ... - def __mod__(self, other: Scalar) -> Self: + def __mod__(self, other: AnyScalar) -> Self: """Return modulus of this dataframe by `other` (`%` operator). Parameters @@ -578,7 +578,7 @@ def __mod__(self, other: Scalar) -> Self: """ ... - def __divmod__(self, other: Scalar) -> tuple[DataFrame, DataFrame]: + def __divmod__(self, other: AnyScalar) -> tuple[DataFrame, DataFrame]: """Return quotient and remainder of integer division. See `divmod` builtin. 
Parameters @@ -593,31 +593,31 @@ def __divmod__(self, other: Scalar) -> tuple[DataFrame, DataFrame]: """ ... - def __radd__(self, other: Scalar) -> Self: + def __radd__(self, other: AnyScalar) -> Self: ... - def __rsub__(self, other: Scalar) -> Self: + def __rsub__(self, other: AnyScalar) -> Self: ... - def __rmul__(self, other: Scalar) -> Self: + def __rmul__(self, other: AnyScalar) -> Self: ... - def __rtruediv__(self, other: Scalar) -> Self: + def __rtruediv__(self, other: AnyScalar) -> Self: ... - def __rand__(self, other: Scalar) -> Self: + def __rand__(self, other: AnyScalar) -> Self: ... - def __ror__(self, other: Scalar) -> Self: + def __ror__(self, other: AnyScalar) -> Self: ... - def __rfloordiv__(self, other: Scalar) -> Self: + def __rfloordiv__(self, other: AnyScalar) -> Self: ... - def __rpow__(self, other: Scalar) -> Self: + def __rpow__(self, other: AnyScalar) -> Self: ... - def __rmod__(self, other: Scalar) -> Self: + def __rmod__(self, other: AnyScalar) -> Self: ... def __invert__(self) -> Self: @@ -803,7 +803,7 @@ def unique_indices(self, *keys: str, skip_nulls: bool = True) -> Column: """ ... - def fill_nan(self, value: Scalar, /) -> Self: + def fill_nan(self, value: AnyScalar, /) -> Self: """Fill ``nan`` values with the given fill value. The fill operation will apply to all columns with a floating-point @@ -821,7 +821,7 @@ def fill_nan(self, value: Scalar, /) -> Self: def fill_null( self, - value: Scalar, + value: AnyScalar, /, *, column_names: list[str] | None = None, diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index ca5f12b6..01a26ab2 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -164,6 +164,10 @@ def __column_consortium_standard__( ... 
+PythonScalar = Union[str, int, float] +AnyScalar = Union[PythonScalar, Scalar] + + __all__ = [ "Column", "DataFrame", @@ -171,6 +175,8 @@ def __column_consortium_standard__( "GroupBy", "Namespace", "Scalar", + "PythonScalar", + "AnyScalar", "SupportsColumnAPI", "SupportsDataFrameAPI", ] diff --git a/spec/API_specification/examples/05_scalars_example.py b/spec/API_specification/examples/05_scalars_example.py index b6b5f41b..9dd95461 100644 --- a/spec/API_specification/examples/05_scalars_example.py +++ b/spec/API_specification/examples/05_scalars_example.py @@ -9,11 +9,11 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI: df = df_raw.__dataframe_consortium_standard__(api_version="2023-11.beta") - # We can fill nulls using a Scalar object. + # `DataFrame.fill_null` accepts `AnyScalar` objects. + # This means we can fill nulls using a Standard Scalar object... df = df.fill_null(df.col("a").mean()) - # Python Scalars also implement the Scalar Protocol (indeed, the Scalar - # Protocol is designed to be a subset of the Python Scalar types), so we - # can pass Python scalars too. + # ...
but also Python scalars: df = df.fill_null(3) + df = df.fill_null("3") return df.dataframe diff --git a/spec/conf.py b/spec/conf.py index 054c012d..d4fc0bbe 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -82,6 +82,7 @@ ('py:class', 'enum.Enum'), ('py:class', 'ellipsis'), ('py:class', 'Scalar'), + ('py:class', 'AnyScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From b879f31ac596451aa2986d82fd0c934758e1f84d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 12:32:54 +0000 Subject: [PATCH 12/26] add Scalar.dtype and Scalar.persist --- .../dataframe_api/scalar_object.py | 65 +++++++++++++------ .../examples/05_scalars_example.py | 15 +++++ 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index a15db639..100be8ab 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any, Protocol +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + from dataframe_api.typing import AnyScalar, DType __all__ = ["Scalar"] @@ -13,64 +16,66 @@ class Scalar(Protocol): as `:meth:`Column.sum`. """ - def __lt__(self, other: Any) -> Scalar: + def __lt__(self, other: AnyScalar) -> Scalar: ... - def __le__(self, other: Any) -> Scalar: + def __le__(self, other: AnyScalar) -> Scalar: ... - def __eq__(self, other: object) -> Scalar: # type: ignore[override] + def __eq__(self, other: AnyScalar) -> Scalar: # type: ignore[override] ... - def __ne__(self, other: object) -> Scalar: # type: ignore[override] + def __ne__(self, other: AnyScalar) -> Scalar: # type: ignore[override] ... - def __gt__(self, other: Any) -> Scalar: + def __gt__(self, other: AnyScalar) -> Scalar: ... 
- def __ge__(self, other: Any) -> Scalar: + def __ge__(self, other: AnyScalar) -> Scalar: ... - def __add__(self, other: Any) -> Scalar: + def __add__(self, other: AnyScalar) -> Scalar: ... - def __radd__(self, other: Any) -> Scalar: + def __radd__(self, other: AnyScalar) -> Scalar: ... - def __sub__(self, other: Any) -> Scalar: + def __sub__(self, other: AnyScalar) -> Scalar: ... - def __rsub__(self, other: Any) -> Scalar: + def __rsub__(self, other: AnyScalar) -> Scalar: ... - def __mul__(self, other: Any) -> Scalar: + def __mul__(self, other: AnyScalar) -> Scalar: ... - def __rmul__(self, other: Any) -> Scalar: + def __rmul__(self, other: AnyScalar) -> Scalar: ... - def __mod__(self, other: Any) -> Scalar: + def __mod__(self, other: AnyScalar) -> Scalar: ... - def __rmod__(self, other: Any) -> Scalar: + # Signatures of "__rmod__" of "Scalar" and "__mod__" of "str | int | float | Scalar" + # are unsafely overlapping + def __rmod__(self, other: AnyScalar) -> Scalar: # type: ignore[misc] ... - def __pow__(self, other: Any) -> Scalar: + def __pow__(self, other: AnyScalar) -> Scalar: ... - def __rpow__(self, other: Any) -> Scalar: + def __rpow__(self, other: AnyScalar) -> Scalar: ... - def __floordiv__(self, other: Any) -> Scalar: + def __floordiv__(self, other: AnyScalar) -> Scalar: ... - def __rfloordiv__(self, other: Any) -> Scalar: + def __rfloordiv__(self, other: AnyScalar) -> Scalar: ... - def __truediv__(self, other: Any) -> Scalar: + def __truediv__(self, other: AnyScalar) -> Scalar: ... - def __rtruediv__(self, other: Any) -> Scalar: + def __rtruediv__(self, other: AnyScalar) -> Scalar: ... def __neg__(self) -> Scalar: @@ -85,3 +90,21 @@ def __bool__(self) -> bool: Depending on the implementation, this may raise or trigger computation. """ ... + + @property + def dtype(self) -> DType: + """Return data type of column.""" + ... + + def persist(self) -> object: + """Hint that computation prior to this point should not be repeated. 
+ + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This may trigger computation and so should be used with care. + See `execution_model` page for more details. + """ + ... diff --git a/spec/API_specification/examples/05_scalars_example.py b/spec/API_specification/examples/05_scalars_example.py index 9dd95461..acf001a8 100644 --- a/spec/API_specification/examples/05_scalars_example.py +++ b/spec/API_specification/examples/05_scalars_example.py @@ -3,6 +3,9 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from dataframe_api.column_object import Column + from dataframe_api.dataframe_object import DataFrame + from dataframe_api.scalar_object import Scalar from dataframe_api.typing import SupportsDataFrameAPI @@ -16,4 +19,16 @@ def main(df_raw: SupportsDataFrameAPI) -> SupportsDataFrameAPI: # ... but also Python scalars: df = df.fill_null(3) df = df.fill_null("3") + + # Scalars can be used in arithmetic expressions with other scalars, columns, + # or DataFrames + value: Scalar = df.col("a").mean() + col: Column = df.col("a") + _res1: Column = value - col + _res2: Scalar = value - 3 + _res3: Scalar = 3 - value + _res4: Column = df.col("a") - 3 + _res5: DataFrame = df - value + _res6: DataFrame = value - df + return df.dataframe From b8011c74d3729aad822e450b0e54379548d00dc4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 12:38:02 +0000 Subject: [PATCH 13/26] update shift arg --- spec/API_specification/dataframe_api/column_object.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index a37df40e..c5ecd55d 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ 
-812,7 +812,7 @@ def rename(self, name: AnyScalar) -> Self: """ ... - def shift(self, offset: int) -> Self: + def shift(self, offset: AnyScalar) -> Self: """Shift values by `offset` positions, filling missing values with `null`. For example, if the original column contains values `[1, 4, 2]`, then: @@ -822,7 +822,7 @@ def shift(self, offset: int) -> Self: Parameters ---------- - offset + offset : int How many positions to shift by. """ ... From 97d8f9a172cc8196126eece9f6bf3ddff5e54b1f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:33:32 +0000 Subject: [PATCH 14/26] use BoolScalar --- .../dataframe_api/column_object.py | 30 +++++------ .../dataframe_api/dataframe_object.py | 28 +++++----- .../dataframe_api/groupby_object.py | 51 +++++++++++-------- .../dataframe_api/scalar_object.py | 2 +- .../API_specification/dataframe_api/typing.py | 10 +++- spec/conf.py | 1 + spec/design_topics/python_builtin_types.md | 2 +- 7 files changed, 72 insertions(+), 52 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index c5ecd55d..a506f1f2 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -6,7 +6,7 @@ from typing_extensions import Self from .scalar_object import Scalar - from .typing import AnyScalar, DType, Namespace + from .typing import AnyScalar, BoolScalar, DType, Namespace __all__ = ["Column"] @@ -287,7 +287,7 @@ def __lt__(self, other: Self | AnyScalar) -> Self: """ ... - def __and__(self, other: Self | AnyScalar) -> Self: + def __and__(self, other: Self | BoolScalar) -> Self: """Apply logical 'and' to `other` Column (or scalar) and this Column. Nulls should follow Kleene Logic. @@ -308,7 +308,7 @@ def __and__(self, other: Self | AnyScalar) -> Self: """ ... 
- def __or__(self, other: Self | AnyScalar) -> Self: + def __or__(self, other: Self | BoolScalar) -> Self: """Apply logical 'or' to `other` Column (or scalar) and this column. Nulls should follow Kleene Logic. @@ -498,7 +498,7 @@ def __invert__(self) -> Self: """ ... - def any(self, *, skip_nulls: bool = True) -> Scalar: + def any(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -508,7 +508,7 @@ def any(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def all(self, *, skip_nulls: bool = True) -> Scalar: + def all(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -518,7 +518,7 @@ def all(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def min(self, *, skip_nulls: bool = True) -> Scalar: + def min(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -526,7 +526,7 @@ def min(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def max(self, *, skip_nulls: bool = True) -> Scalar: + def max(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -534,7 +534,7 @@ def max(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def sum(self, *, skip_nulls: bool = True) -> Scalar: + def sum(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -543,7 +543,7 @@ def sum(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def prod(self, *, skip_nulls: bool = True) -> Scalar: + def prod(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical data types. @@ -551,7 +551,7 @@ def prod(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def median(self, *, skip_nulls: bool = True) -> Scalar: + def median(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. 
Must be supported for numerical and @@ -561,7 +561,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar: """ ... - def mean(self, *, skip_nulls: bool = True) -> Scalar: + def mean(self, *, skip_nulls: BoolScalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -575,7 +575,7 @@ def std( self, *, correction: int | float = 1, - skip_nulls: bool = True, + skip_nulls: BoolScalar = True, ) -> Scalar: """Reduction returns a scalar. @@ -608,7 +608,7 @@ def var( self, *, correction: int | float = 1, - skip_nulls: bool = True, + skip_nulls: BoolScalar = True, ) -> Scalar: """Reduction returns a scalar. @@ -718,7 +718,7 @@ def is_in(self, values: Self) -> Self: """ ... - def unique_indices(self, *, skip_nulls: AnyScalar = True) -> Self: + def unique_indices(self, *, skip_nulls: BoolScalar = True) -> Self: """Return indices corresponding to unique values in Column. Returns @@ -819,7 +819,7 @@ def shift(self, offset: AnyScalar) -> Self: - `.shift(1)` will return `[null, 1, 4]`, - `.shift(-1)` will return `[4, 2, null]`, - + Parameters ---------- offset : int diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 58fd5439..c6e91f3e 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -9,7 +9,7 @@ from .column_object import Column from .groupby_object import GroupBy - from .typing import AnyScalar, DType, Namespace, SupportsDataFrameAPI + from .typing import AnyScalar, BoolScalar, DType, Namespace, SupportsDataFrameAPI __all__ = ["DataFrame"] @@ -641,7 +641,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: bool = True) -> Self: + def any(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame. 
Raises @@ -651,7 +651,7 @@ def any(self, *, skip_nulls: bool = True) -> Self: """ ... - def all(self, *, skip_nulls: bool = True) -> Self: + def all(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame. Raises @@ -661,7 +661,7 @@ def all(self, *, skip_nulls: bool = True) -> Self: """ ... - def any_rowwise(self, *, skip_nulls: bool = True) -> Column: + def any_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.any`` and that the reduction happens @@ -674,7 +674,7 @@ def any_rowwise(self, *, skip_nulls: bool = True) -> Column: """ ... - def all_rowwise(self, *, skip_nulls: bool = True) -> Column: + def all_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.all`` and that the reduction happens @@ -687,31 +687,31 @@ def all_rowwise(self, *, skip_nulls: bool = True) -> Column: """ ... - def min(self, *, skip_nulls: bool = True) -> Self: + def min(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def max(self, *, skip_nulls: bool = True) -> Self: + def max(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def sum(self, *, skip_nulls: bool = True) -> Self: + def sum(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def prod(self, *, skip_nulls: bool = True) -> Self: + def prod(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def median(self, *, skip_nulls: bool = True) -> Self: + def median(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def mean(self, *, skip_nulls: bool = True) -> Self: + def mean(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... 
- def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: + def std(self, *, correction: int | float = 1, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame. Parameters @@ -725,7 +725,7 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: """ ... - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Self: + def var(self, *, correction: int | float = 1, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame. Parameters @@ -777,7 +777,7 @@ def is_nan(self) -> Self: """ ... - def unique_indices(self, *keys: str, skip_nulls: bool = True) -> Column: + def unique_indices(self, *keys: str, skip_nulls: BoolScalar = True) -> Column: """Return indices corresponding to unique values across selected columns. Parameters diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index adecb8aa..6e8e0d3b 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,6 +4,7 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame + from .typing import BoolScalar __all__ = [ @@ -22,34 +23,44 @@ class GroupBy(Protocol): """ - def any(self, *, skip_nulls: bool = True) -> DataFrame: + def any(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def all(self, *, skip_nulls: bool = True) -> DataFrame: + def all(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def min(self, *, skip_nulls: bool = True) -> DataFrame: + def min(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def max(self, *, skip_nulls: bool = True) -> DataFrame: + def max(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame: + def sum(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... 
- def prod(self, *, skip_nulls: bool = True) -> DataFrame: + def prod(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def median(self, *, skip_nulls: bool = True) -> DataFrame: + def median(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame: + def mean(self, *, skip_nulls: BoolScalar = True) -> DataFrame: ... - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: + def std( + self, + *, + correction: int | float = 1, + skip_nulls: BoolScalar = True, + ) -> DataFrame: ... - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: + def var( + self, + *, + correction: int | float = 1, + skip_nulls: BoolScalar = True, + ) -> DataFrame: ... def size(self) -> DataFrame: @@ -82,35 +93,35 @@ def rename(self, name: str) -> Aggregation: ... @classmethod - def any(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def any(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def all(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def all(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def min(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def min(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def max(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def max(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def sum(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def sum(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def prod(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def prod(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... 
@classmethod - def median(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def median(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod - def mean(cls, column: str, *, skip_nulls: bool = True) -> Aggregation: + def mean(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: ... @classmethod @@ -119,7 +130,7 @@ def std( column: str, *, correction: int | float = 1, - skip_nulls: bool = True, + skip_nulls: BoolScalar = True, ) -> Aggregation: ... @@ -129,7 +140,7 @@ def var( column: str, *, correction: int | float = 1, - skip_nulls: bool = True, + skip_nulls: BoolScalar = True, ) -> Aggregation: ... diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index 100be8ab..2f0012a0 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -90,7 +90,7 @@ def __bool__(self) -> bool: Depending on the implementation, this may raise or trigger computation. """ ... - + @property def dtype(self) -> DType: """Return data type of column.""" diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 01a26ab2..68501060 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -164,7 +164,11 @@ def __column_consortium_standard__( ... 
-PythonScalar = Union[str, int, float] +PythonScalar = Union[str, int, float, bool] +BoolScalar = Union[bool, Scalar] +FloatScalar = Union[float, Scalar] +IntScalar = Union[int, Scalar] +StringScalar = Union[str, Scalar] AnyScalar = Union[PythonScalar, Scalar] @@ -176,6 +180,10 @@ def __column_consortium_standard__( "Namespace", "Scalar", "PythonScalar", + "FloatScalar", + "IntScalar", + "BoolScalar", + "StringScalar", "AnyScalar", "SupportsColumnAPI", "SupportsDataFrameAPI", diff --git a/spec/conf.py b/spec/conf.py index d4fc0bbe..589e6c0e 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -83,6 +83,7 @@ ('py:class', 'ellipsis'), ('py:class', 'Scalar'), ('py:class', 'AnyScalar'), + ('py:class', 'BoolScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 11411267..9e9fc323 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: bool = True) -> Scalar: + def mean(self, skip_nulls: BoolScalar = True) -> Scalar: ... 
larger = df2 > df1.col('foo').mean() From 3b7bcb6ae6962856ef0be2e6687f0f37961c7ff8 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:43:38 +0000 Subject: [PATCH 15/26] use float scalar in some parts --- .../API_specification/dataframe_api/column_object.py | 4 ++-- .../dataframe_api/dataframe_object.py | 12 ++++++++++-- spec/API_specification/dataframe_api/typing.py | 8 ++++++-- spec/conf.py | 1 + 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index a506f1f2..03adc7af 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -6,7 +6,7 @@ from typing_extensions import Self from .scalar_object import Scalar - from .typing import AnyScalar, BoolScalar, DType, Namespace + from .typing import AnyScalar, BoolScalar, DType, FloatScalar, Namespace, NullType __all__ = ["Column"] @@ -738,7 +738,7 @@ def unique_indices(self, *, skip_nulls: BoolScalar = True) -> Self: """ ... - def fill_nan(self, value: AnyScalar, /) -> Self: + def fill_nan(self, value: FloatScalar | NullType, /) -> Self: """Fill floating point ``nan`` values with the given fill value. 
Parameters diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index c6e91f3e..8bd11228 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -9,7 +9,15 @@ from .column_object import Column from .groupby_object import GroupBy - from .typing import AnyScalar, BoolScalar, DType, Namespace, SupportsDataFrameAPI + from .typing import ( + AnyScalar, + BoolScalar, + DType, + FloatScalar, + Namespace, + NullType, + SupportsDataFrameAPI, + ) __all__ = ["DataFrame"] @@ -803,7 +811,7 @@ def unique_indices(self, *keys: str, skip_nulls: BoolScalar = True) -> Column: """ ... - def fill_nan(self, value: AnyScalar, /) -> Self: + def fill_nan(self, value: FloatScalar | NullType, /) -> Self: """Fill ``nan`` values with the given fill value. The fill operation will apply to all columns with a floating-point diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 68501060..d5552605 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -63,7 +63,10 @@ class Bool: class Date: ... - null: Scalar + class NullType: + ... 
+ + null: NullType class Datetime: def __init__( # noqa: ANN204 @@ -170,6 +173,7 @@ def __column_consortium_standard__( IntScalar = Union[int, Scalar] StringScalar = Union[str, Scalar] AnyScalar = Union[PythonScalar, Scalar] +NullType = Namespace.NullType __all__ = [ @@ -184,7 +188,7 @@ def __column_consortium_standard__( "IntScalar", "BoolScalar", "StringScalar", - "AnyScalar", + "NullType", "SupportsColumnAPI", "SupportsDataFrameAPI", ] diff --git a/spec/conf.py b/spec/conf.py index 589e6c0e..711f5569 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -84,6 +84,7 @@ ('py:class', 'Scalar'), ('py:class', 'AnyScalar'), ('py:class', 'BoolScalar'), + ('py:class', 'FloatScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From d598a8dddc04fe46cad45d21cd9a7fe54cd26ffb Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:44:01 +0000 Subject: [PATCH 16/26] use float scalar in some parts --- spec/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spec/conf.py b/spec/conf.py index 711f5569..e67f7884 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -85,6 +85,7 @@ ('py:class', 'AnyScalar'), ('py:class', 'BoolScalar'), ('py:class', 'FloatScalar'), + ('py:class', 'NullType'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From a12585b8f5b5712f7acc3b4d8669e91756ebbbb7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:46:44 +0000 Subject: [PATCH 17/26] string scalar for rename --- .../API_specification/dataframe_api/column_object.py | 12 ++++++++++-- .../dataframe_api/groupby_object.py | 4 ++-- spec/conf.py | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 03adc7af..03a22822 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ 
b/spec/API_specification/dataframe_api/column_object.py @@ -6,7 +6,15 @@ from typing_extensions import Self from .scalar_object import Scalar - from .typing import AnyScalar, BoolScalar, DType, FloatScalar, Namespace, NullType + from .typing import ( + AnyScalar, + BoolScalar, + DType, + FloatScalar, + Namespace, + NullType, + StringScalar, + ) __all__ = ["Column"] @@ -797,7 +805,7 @@ def to_array(self) -> Any: """ ... - def rename(self, name: AnyScalar) -> Self: + def rename(self, name: StringScalar) -> Self: """Rename column. Parameters diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 6e8e0d3b..7637f571 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame - from .typing import BoolScalar + from .typing import BoolScalar, StringScalar __all__ = [ @@ -85,7 +85,7 @@ def aggregate(self, *aggregation: Aggregation) -> DataFrame: class Aggregation(Protocol): - def rename(self, name: str) -> Aggregation: + def rename(self, name: StringScalar) -> Aggregation: """Assign given name to output of aggregation. If not called, the column's name will be used as the output name. 
diff --git a/spec/conf.py b/spec/conf.py index e67f7884..c04881b6 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -86,6 +86,7 @@ ('py:class', 'BoolScalar'), ('py:class', 'FloatScalar'), ('py:class', 'NullType'), + ('py:class', 'StringScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From 35cd4ed3bbf72383f277b044d6502fe24f312fbb Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:47:31 +0000 Subject: [PATCH 18/26] intscalar for shift --- spec/API_specification/dataframe_api/column_object.py | 3 ++- spec/API_specification/dataframe_api/typing.py | 1 + spec/conf.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 03a22822..5415cba5 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -11,6 +11,7 @@ BoolScalar, DType, FloatScalar, + IntScalar, Namespace, NullType, StringScalar, @@ -820,7 +821,7 @@ def rename(self, name: StringScalar) -> Self: """ ... - def shift(self, offset: AnyScalar) -> Self: + def shift(self, offset: IntScalar) -> Self: """Shift values by `offset` positions, filling missing values with `null`. 
For example, if the original column contains values `[1, 4, 2]`, then: diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index d5552605..65fb4b3f 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -188,6 +188,7 @@ def __column_consortium_standard__( "IntScalar", "BoolScalar", "StringScalar", + "IntScalar", "NullType", "SupportsColumnAPI", "SupportsDataFrameAPI", diff --git a/spec/conf.py b/spec/conf.py index c04881b6..05fa2104 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -87,6 +87,7 @@ ('py:class', 'FloatScalar'), ('py:class', 'NullType'), ('py:class', 'StringScalar'), + ('py:class', 'IntScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From fade164de9434c7a318b1345157578b4f0b5b946 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:51:52 +0000 Subject: [PATCH 19/26] numeric scalar for correction --- spec/API_specification/dataframe_api/column_object.py | 7 ++++--- .../dataframe_api/dataframe_object.py | 9 +++++++-- spec/API_specification/dataframe_api/groupby_object.py | 10 +++++----- spec/API_specification/dataframe_api/typing.py | 2 ++ spec/conf.py | 1 + 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 5415cba5..16f9bddb 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -14,6 +14,7 @@ IntScalar, Namespace, NullType, + NumericScalar, StringScalar, ) @@ -583,7 +584,7 @@ def mean(self, *, skip_nulls: BoolScalar = True) -> Scalar: def std( self, *, - correction: int | float = 1, + correction: float = 1, skip_nulls: BoolScalar = True, ) -> Scalar: """Reduction returns a scalar. 
@@ -616,7 +617,7 @@ def std( def var( self, *, - correction: int | float = 1, + correction: NumericScalar = 1, skip_nulls: BoolScalar = True, ) -> Scalar: """Reduction returns a scalar. @@ -912,7 +913,7 @@ def iso_weekday(self) -> Self: """ ... - def unix_timestamp(self, *, time_unit: AnyScalar = "s") -> Self: + def unix_timestamp(self, *, time_unit: StringScalar = "s") -> Self: """Return number of seconds / milliseconds / microseconds since the Unix epoch. The Unix epoch is 00:00:00 UTC on 1 January 1970. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 8bd11228..199c0bd2 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -16,6 +16,7 @@ FloatScalar, Namespace, NullType, + NumericScalar, SupportsDataFrameAPI, ) @@ -719,7 +720,9 @@ def mean(self, *, skip_nulls: BoolScalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def std(self, *, correction: int | float = 1, skip_nulls: BoolScalar = True) -> Self: + def std( + self, *, correction: NumericScalar = 1, skip_nulls: BoolScalar = True, + ) -> Self: """Reduction returns a 1-row DataFrame. Parameters @@ -733,7 +736,9 @@ def std(self, *, correction: int | float = 1, skip_nulls: BoolScalar = True) -> """ ... - def var(self, *, correction: int | float = 1, skip_nulls: BoolScalar = True) -> Self: + def var( + self, *, correction: NumericScalar = 1, skip_nulls: BoolScalar = True, + ) -> Self: """Reduction returns a 1-row DataFrame. 
Parameters diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 7637f571..e01b477a 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame - from .typing import BoolScalar, StringScalar + from .typing import BoolScalar, NumericScalar, StringScalar __all__ = [ @@ -50,7 +50,7 @@ def mean(self, *, skip_nulls: BoolScalar = True) -> DataFrame: def std( self, *, - correction: int | float = 1, + correction: NumericScalar = 1, skip_nulls: BoolScalar = True, ) -> DataFrame: ... @@ -58,7 +58,7 @@ def std( def var( self, *, - correction: int | float = 1, + correction: NumericScalar = 1, skip_nulls: BoolScalar = True, ) -> DataFrame: ... @@ -129,7 +129,7 @@ def std( cls, column: str, *, - correction: int | float = 1, + correction: NumericScalar = 1, skip_nulls: BoolScalar = True, ) -> Aggregation: ... @@ -139,7 +139,7 @@ def var( cls, column: str, *, - correction: int | float = 1, + correction: NumericScalar = 1, skip_nulls: BoolScalar = True, ) -> Aggregation: ... 
diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 65fb4b3f..10c111a6 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -171,6 +171,7 @@ def __column_consortium_standard__( BoolScalar = Union[bool, Scalar] FloatScalar = Union[float, Scalar] IntScalar = Union[int, Scalar] +NumericScalar = Union[FloatScalar, IntScalar] StringScalar = Union[str, Scalar] AnyScalar = Union[PythonScalar, Scalar] NullType = Namespace.NullType @@ -189,6 +190,7 @@ def __column_consortium_standard__( "BoolScalar", "StringScalar", "IntScalar", + "NumericScalar", "NullType", "SupportsColumnAPI", "SupportsDataFrameAPI", diff --git a/spec/conf.py b/spec/conf.py index 05fa2104..8f9dd5ff 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -88,6 +88,7 @@ ('py:class', 'NullType'), ('py:class', 'StringScalar'), ('py:class', 'IntScalar'), + ('py:class', 'NumericScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From 29ceed2467da0106a975f9c46b5b7a1458283e85 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:03:56 +0000 Subject: [PATCH 20/26] simplify --- .../dataframe_api/column_object.py | 41 +++++++-------- .../dataframe_api/dataframe_object.py | 38 ++++++++------ .../dataframe_api/groupby_object.py | 52 +++++++++---------- .../dataframe_api/scalar_object.py | 2 +- .../API_specification/dataframe_api/typing.py | 13 +---- spec/conf.py | 6 +-- spec/design_topics/python_builtin_types.md | 2 +- 7 files changed, 71 insertions(+), 83 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 16f9bddb..7a7bdb99 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -8,14 +8,9 @@ from .scalar_object import Scalar from .typing 
import ( AnyScalar, - BoolScalar, DType, - FloatScalar, - IntScalar, Namespace, NullType, - NumericScalar, - StringScalar, ) @@ -297,7 +292,7 @@ def __lt__(self, other: Self | AnyScalar) -> Self: """ ... - def __and__(self, other: Self | BoolScalar) -> Self: + def __and__(self, other: Self | bool | Scalar) -> Self: """Apply logical 'and' to `other` Column (or scalar) and this Column. Nulls should follow Kleene Logic. @@ -318,7 +313,7 @@ def __and__(self, other: Self | BoolScalar) -> Self: """ ... - def __or__(self, other: Self | BoolScalar) -> Self: + def __or__(self, other: Self | bool | Scalar) -> Self: """Apply logical 'or' to `other` Column (or scalar) and this column. Nulls should follow Kleene Logic. @@ -508,7 +503,7 @@ def __invert__(self) -> Self: """ ... - def any(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def any(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -518,7 +513,7 @@ def any(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def all(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def all(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a bool. Raises @@ -528,7 +523,7 @@ def all(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def min(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def min(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -536,7 +531,7 @@ def min(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def max(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def max(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Any data type that supports comparisons @@ -544,7 +539,7 @@ def max(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def sum(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def sum(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. 
Must be supported for numerical and @@ -553,7 +548,7 @@ def sum(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def prod(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def prod(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical data types. @@ -561,7 +556,7 @@ def prod(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def median(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def median(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -571,7 +566,7 @@ def median(self, *, skip_nulls: BoolScalar = True) -> Scalar: """ ... - def mean(self, *, skip_nulls: BoolScalar = True) -> Scalar: + def mean(self, *, skip_nulls: bool | Scalar = True) -> Scalar: """Reduction returns a scalar. Must be supported for numerical and @@ -585,7 +580,7 @@ def std( self, *, correction: float = 1, - skip_nulls: BoolScalar = True, + skip_nulls: bool | Scalar = True, ) -> Scalar: """Reduction returns a scalar. @@ -617,8 +612,8 @@ def std( def var( self, *, - correction: NumericScalar = 1, - skip_nulls: BoolScalar = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Scalar: """Reduction returns a scalar. @@ -728,7 +723,7 @@ def is_in(self, values: Self) -> Self: """ ... - def unique_indices(self, *, skip_nulls: BoolScalar = True) -> Self: + def unique_indices(self, *, skip_nulls: bool | Scalar = True) -> Self: """Return indices corresponding to unique values in Column. Returns @@ -748,7 +743,7 @@ def unique_indices(self, *, skip_nulls: BoolScalar = True) -> Self: """ ... - def fill_nan(self, value: FloatScalar | NullType, /) -> Self: + def fill_nan(self, value: float | NullType | Scalar, /) -> Self: """Fill floating point ``nan`` values with the given fill value. Parameters @@ -807,7 +802,7 @@ def to_array(self) -> Any: """ ... 
- def rename(self, name: StringScalar) -> Self: + def rename(self, name: str | Scalar) -> Self: """Rename column. Parameters @@ -822,7 +817,7 @@ def rename(self, name: StringScalar) -> Self: """ ... - def shift(self, offset: IntScalar) -> Self: + def shift(self, offset: int | Scalar) -> Self: """Shift values by `offset` positions, filling missing values with `null`. For example, if the original column contains values `[1, 4, 2]`, then: @@ -913,7 +908,7 @@ def iso_weekday(self) -> Self: """ ... - def unix_timestamp(self, *, time_unit: StringScalar = "s") -> Self: + def unix_timestamp(self, *, time_unit: str | Scalar = "s") -> Self: """Return number of seconds / milliseconds / microseconds since the Unix epoch. The Unix epoch is 00:00:00 UTC on 1 January 1970. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 199c0bd2..842c05b2 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -11,12 +11,10 @@ from .groupby_object import GroupBy from .typing import ( AnyScalar, - BoolScalar, DType, - FloatScalar, Namespace, NullType, - NumericScalar, + Scalar, SupportsDataFrameAPI, ) @@ -650,7 +648,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: BoolScalar = True) -> Self: + def any(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame. Raises @@ -660,7 +658,7 @@ def any(self, *, skip_nulls: BoolScalar = True) -> Self: """ ... - def all(self, *, skip_nulls: BoolScalar = True) -> Self: + def all(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame. Raises @@ -670,7 +668,7 @@ def all(self, *, skip_nulls: BoolScalar = True) -> Self: """ ... 
- def any_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: + def any_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.any`` and that the reduction happens @@ -683,7 +681,7 @@ def any_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: """ ... - def all_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: + def all_rowwise(self, *, skip_nulls: bool | Scalar = True) -> Column: """Reduction returns a Column. Differs from ``DataFrame.all`` and that the reduction happens @@ -696,32 +694,35 @@ def all_rowwise(self, *, skip_nulls: BoolScalar = True) -> Column: """ ... - def min(self, *, skip_nulls: BoolScalar = True) -> Self: + def min(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def max(self, *, skip_nulls: BoolScalar = True) -> Self: + def max(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def sum(self, *, skip_nulls: BoolScalar = True) -> Self: + def sum(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def prod(self, *, skip_nulls: BoolScalar = True) -> Self: + def prod(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def median(self, *, skip_nulls: BoolScalar = True) -> Self: + def median(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... - def mean(self, *, skip_nulls: BoolScalar = True) -> Self: + def mean(self, *, skip_nulls: bool | Scalar = True) -> Self: """Reduction returns a 1-row DataFrame.""" ... def std( - self, *, correction: NumericScalar = 1, skip_nulls: BoolScalar = True, + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Self: """Reduction returns a 1-row DataFrame. @@ -737,7 +738,10 @@ def std( ... 
def var( - self, *, correction: NumericScalar = 1, skip_nulls: BoolScalar = True, + self, + *, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Self: """Reduction returns a 1-row DataFrame. @@ -790,7 +794,7 @@ def is_nan(self) -> Self: """ ... - def unique_indices(self, *keys: str, skip_nulls: BoolScalar = True) -> Column: + def unique_indices(self, *keys: str, skip_nulls: bool | Scalar = True) -> Column: """Return indices corresponding to unique values across selected columns. Parameters @@ -816,7 +820,7 @@ def unique_indices(self, *keys: str, skip_nulls: BoolScalar = True) -> Column: """ ... - def fill_nan(self, value: FloatScalar | NullType, /) -> Self: + def fill_nan(self, value: float | NullType | Scalar, /) -> Self: """Fill ``nan`` values with the given fill value. The fill operation will apply to all columns with a floating-point diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index e01b477a..8b26eff3 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame - from .typing import BoolScalar, NumericScalar, StringScalar + from .typing import Scalar __all__ = [ @@ -23,43 +23,43 @@ class GroupBy(Protocol): """ - def any(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def any(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def all(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def all(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def min(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def min(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def max(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def max(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... 
- def sum(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def sum(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def prod(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def prod(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def median(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def median(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... - def mean(self, *, skip_nulls: BoolScalar = True) -> DataFrame: + def mean(self, *, skip_nulls: bool | Scalar = True) -> DataFrame: ... def std( self, *, - correction: NumericScalar = 1, - skip_nulls: BoolScalar = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> DataFrame: ... def var( self, *, - correction: NumericScalar = 1, - skip_nulls: BoolScalar = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> DataFrame: ... @@ -85,7 +85,7 @@ def aggregate(self, *aggregation: Aggregation) -> DataFrame: class Aggregation(Protocol): - def rename(self, name: StringScalar) -> Aggregation: + def rename(self, name: str | Scalar) -> Aggregation: """Assign given name to output of aggregation. If not called, the column's name will be used as the output name. @@ -93,35 +93,35 @@ def rename(self, name: StringScalar) -> Aggregation: ... @classmethod - def any(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def any(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def all(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def all(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def min(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def min(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... 
@classmethod - def max(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def max(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def sum(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def sum(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def prod(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def prod(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def median(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def median(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod - def mean(cls, column: str, *, skip_nulls: BoolScalar = True) -> Aggregation: + def mean(cls, column: str, *, skip_nulls: bool | Scalar = True) -> Aggregation: ... @classmethod @@ -129,8 +129,8 @@ def std( cls, column: str, *, - correction: NumericScalar = 1, - skip_nulls: BoolScalar = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Aggregation: ... @@ -139,8 +139,8 @@ def var( cls, column: str, *, - correction: NumericScalar = 1, - skip_nulls: BoolScalar = True, + correction: float | Scalar = 1, + skip_nulls: bool | Scalar = True, ) -> Aggregation: ... diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index 2f0012a0..3bbd8d0d 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -93,7 +93,7 @@ def __bool__(self) -> bool: @property def dtype(self) -> DType: - """Return data type of column.""" + """Return data type of scalar.""" ... 
def persist(self) -> object: diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py index 10c111a6..574c9480 100644 --- a/spec/API_specification/dataframe_api/typing.py +++ b/spec/API_specification/dataframe_api/typing.py @@ -168,11 +168,6 @@ def __column_consortium_standard__( PythonScalar = Union[str, int, float, bool] -BoolScalar = Union[bool, Scalar] -FloatScalar = Union[float, Scalar] -IntScalar = Union[int, Scalar] -NumericScalar = Union[FloatScalar, IntScalar] -StringScalar = Union[str, Scalar] AnyScalar = Union[PythonScalar, Scalar] NullType = Namespace.NullType @@ -183,14 +178,8 @@ def __column_consortium_standard__( "DType", "GroupBy", "Namespace", + "AnyScalar", "Scalar", - "PythonScalar", - "FloatScalar", - "IntScalar", - "BoolScalar", - "StringScalar", - "IntScalar", - "NumericScalar", "NullType", "SupportsColumnAPI", "SupportsDataFrameAPI", diff --git a/spec/conf.py b/spec/conf.py index 8f9dd5ff..8690fb84 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -84,10 +84,10 @@ ('py:class', 'Scalar'), ('py:class', 'AnyScalar'), ('py:class', 'BoolScalar'), - ('py:class', 'FloatScalar'), + ('py:class', 'float | Scalar'), ('py:class', 'NullType'), - ('py:class', 'StringScalar'), - ('py:class', 'IntScalar'), + ('py:class', 'str | Scalar'), + ('py:class', 'int | Scalar'), ('py:class', 'NumericScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 9e9fc323..11d5db54 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: BoolScalar = True) -> Scalar: + def mean(self, skip_nulls: bool | Scalar = True) -> Scalar: ... 
larger = df2 > df1.col('foo').mean() From bee402f2335d0202967c4dac5a8ffcdb6c4b741c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:21:53 +0000 Subject: [PATCH 21/26] update python builtin types desc --- .../dataframe_api/scalar_object.py | 8 +++++++ spec/design_topics/python_builtin_types.md | 22 ++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index 3bbd8d0d..0642c26e 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -14,6 +14,14 @@ class Scalar(Protocol): Not meant to be instantiated directly, but rather created via `:meth:Column.get_value` or one of the column reductions such as `:meth:`Column.sum`. + + Note that, just like how a :class:`Column` can hold null values, + a `Scalar` can also be backed by a null value. Given that `Scalar`s + aren't instantiated directly, but rather derived from existing + `Column`s, `Scalar.dtype` is determined by the parent `Column`. + For example, if `column` is a `Column` of dtype `Int64`, then + `column.get_value(0)` will return a `Scalar` of dtype `Int64` + (even if it is backed by a null value). """ def __lt__(self, other: AnyScalar) -> Scalar: diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 11d5db54..be2cde0c 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -6,9 +6,9 @@ it is also potentially problematic when trying to write performant dataframe library code or supporting devices other than CPU. This standard specifies the use of Python types in quite a few places, and uses -them as type annotations.
As a concrete example, consider the `mean` method and -the `float` it is documented to return, in combination with the `__gt__` method -(i.e., the `>` operator) on the dataframe: +them as type annotations. As a concrete example, consider the `mean` method, +the `bool | Scalar` argument it takes, and the `Scalar` it is documented to return, +in combination with the `__gt__` method (i.e., the `>` operator) on the dataframe: ```python class DataFrame: @@ -21,15 +21,17 @@ class Column: def mean(self, skip_nulls: bool | Scalar = True) -> Scalar: ... -larger = df2 > df1.col('foo').mean() +larger = df2 > df1.col('foo').mean(skip_nulls=True) ``` -For a GPU dataframe library, it is desirable for all data to reside on the GPU, -and not incur a performance penalty from synchronizing instances of Python -builtin types to CPU. In the above example, the `.mean()` call returns a -`Scalar`. It is likely beneficial though to implement this as a library-specific -scalar object which (partially) duck types with `float`. The required methods it -must implement are listed in the spec for class `Scalar`. +Let's go through these arguments: +- `skip_nulls: bool | Scalar`. This means we can either pass a Python `bool`, or + a `Scalar` object backed by a boolean; +- the return value of `.mean()` is a `Scalar` +- the return value of `__gt__` is also a `Scalar`. + +This allows scalars to reside on different devices (e.g. GPU), or to stay lazy +(if a library allows that).
## Example From 15090ac6e6ebc20cd93ee4241453465d995bbee8 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:23:23 +0000 Subject: [PATCH 22/26] fixup --- spec/API_specification/dataframe_api/column_object.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 841b8837..7d70eb1b 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -5,11 +5,9 @@ if TYPE_CHECKING: from typing_extensions import Self - from dataframe_api.dataframe_object import DataFrame - - from .scalar_object import Scalar from .typing import ( AnyScalar, + DataFrame, DType, Namespace, NullType, From 24d2ad83b1ae7ff40fa2881426ff13471b66495c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:35:30 +0000 Subject: [PATCH 23/26] enable extra ruff rule, note AnyScalar --- spec/API_specification/pyproject.toml | 1 - spec/design_topics/python_builtin_types.md | 9 ++++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/spec/API_specification/pyproject.toml b/spec/API_specification/pyproject.toml index 8b88d455..31c5f2ad 100644 --- a/spec/API_specification/pyproject.toml +++ b/spec/API_specification/pyproject.toml @@ -47,5 +47,4 @@ ignore = [ "N999", # invalid-module-name "PD901", # pandas-df-variable-name "PLR0913", # too-many-arguments - "PYI041", # redundant-numeric-union ] diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index be2cde0c..ee0c3541 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -28,10 +28,13 @@ Let's go through these arguments: - `skip_nulls: bool | Scalar`. 
This means we can either pass a Python `bool`, or a `Scalar` object backed by a boolean; - the return value of `.mean()` is a `Scalar` -- the return value of `__gt__` is also a `Scalar`. +- the argument `other` of `__gt__` is typed as `AnyScalar`, meaning that we can + compare a `DataFrame` with a Python scalar (e.g. `df > 3`) or with a `Scalar` + (e.g. `df > df.col('a').mean()`) +- the return value of `__gt__` is a `Scalar` -This allows scalars to reside on different devices (e.g. GPU), or to stay lazy -(if a library allows that). +Returning values as `Scalar` allows scalars to reside on different devices (e.g. GPU), +or to stay lazy (if a library allows it). ## Example From 8360d96da4fe20762f7a92764f47415ea7f0052d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:36:28 +0000 Subject: [PATCH 24/26] remove some unnecessary nitpick ignores --- spec/conf.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/spec/conf.py b/spec/conf.py index 8690fb84..a89cfee5 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -83,12 +83,7 @@ ('py:class', 'ellipsis'), ('py:class', 'Scalar'), ('py:class', 'AnyScalar'), - ('py:class', 'BoolScalar'), - ('py:class', 'float | Scalar'), ('py:class', 'NullType'), - ('py:class', 'str | Scalar'), - ('py:class', 'int | Scalar'), - ('py:class', 'NumericScalar'), ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'Aggregation'), From f69679aa0aa66481d04b87e00b04e00c0e573102 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:39:21 +0000 Subject: [PATCH 25/26] return Self from Scalar.persist, add column.persist --- .../dataframe_api/column_object.py | 13 +++++++++++++ .../dataframe_api/scalar_object.py | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 
7d70eb1b..9ae7de4a 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -1044,3 +1044,16 @@ def unix_timestamp(self, *, time_unit: str | Scalar = "s") -> Self: discarded. """ ... + + def persist(self) -> Self: + """Hint that computation prior to this point should not be repeated. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This method may trigger execution. If necessary, it should be called + at most once per dataframe, and as late as possible in the pipeline. + """ + ... diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index 0642c26e..d9d8c883 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol +from typing import TYPE_CHECKING, Protocol, Any if TYPE_CHECKING: from dataframe_api.typing import AnyScalar, DType @@ -104,7 +104,7 @@ def dtype(self) -> DType: """Return data type of scalar.""" ... - def persist(self) -> object: + def persist(self) -> Self: """Hint that computation prior to this point should not be repeated. This is intended as a hint, rather than as a directive. 
Implementations From 216b5e61b44cbbde294c2f95bc676c0d8ec2316e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:40:01 +0000 Subject: [PATCH 26/26] fixup --- spec/API_specification/dataframe_api/scalar_object.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/scalar_object.py b/spec/API_specification/dataframe_api/scalar_object.py index d9d8c883..078cf2a4 100644 --- a/spec/API_specification/dataframe_api/scalar_object.py +++ b/spec/API_specification/dataframe_api/scalar_object.py @@ -1,8 +1,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol, Any +from typing import TYPE_CHECKING, Protocol if TYPE_CHECKING: + from typing_extensions import Self + from dataframe_api.typing import AnyScalar, DType __all__ = ["Scalar"]