-
Notifications
You must be signed in to change notification settings - Fork 21
RFC Replace column with expression #247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
+355
−463
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,21 +3,20 @@ | |
""" | ||
from __future__ import annotations | ||
|
||
from typing import Mapping, Sequence, Any | ||
from typing import Mapping, Sequence, Any, Literal | ||
|
||
from .column_object import * | ||
from .expression_object import * | ||
from .dataframe_object import DataFrame | ||
from .groupby_object import * | ||
from ._types import DType | ||
|
||
__all__ = [ | ||
"__dataframe_api_version__", | ||
"DataFrame", | ||
"Column", | ||
"column_from_sequence", | ||
"column_from_1d_array", | ||
"col", | ||
"concat", | ||
"dataframe_from_dict", | ||
"sorted_indices", | ||
"unique_indices", | ||
"dataframe_from_2d_array", | ||
"is_null", | ||
"null", | ||
|
@@ -43,6 +42,21 @@ | |
implementation of the dataframe API standard. | ||
""" | ||
|
||
def col(name: str) -> Expression: | ||
""" | ||
Instantiate an Expression which selects given column by name. | ||
|
||
For example, to select column 'species' and then use it to filter | ||
a DataFrame, you could do: | ||
|
||
.. code-block:: python | ||
|
||
df: DataFrame | ||
namespace = df.__dataframe_namespace__() | ||
df.get_rows_by_mask(namespace.col('species') == 'setosa') | ||
""" | ||
... | ||
|
||
def concat(dataframes: Sequence[DataFrame]) -> DataFrame: | ||
""" | ||
Concatenate DataFrames vertically. | ||
|
@@ -63,104 +77,116 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: | |
""" | ||
... | ||
|
||
def column_from_sequence(sequence: Sequence[Any], *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]: | ||
def any_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Hence, I'm moving it to a top-level function. Example usage: def my_agnostic_func(df):
df = df.__dataframe_consortium_standard__()
namespace = df.__dataframe_namespace__()
result = df.get_rows_by_mask(namespace.any_rowwise())
return result.dataframe |
||
""" | ||
Construct Column from sequence of elements. | ||
Reduction returns an Expression. | ||
|
||
Differs from ``DataFrame.any`` in that the reduction happens | ||
for each row, rather than for each column. | ||
|
||
Parameters | ||
---------- | ||
sequence : Sequence[object] | ||
Sequence of elements. Each element must be of the specified | ||
``dtype``, the corresponding Python builtin scalar type, or | ||
coercible to that Python scalar type. | ||
name : str, optional | ||
Name of column. | ||
dtype : DType | ||
Dtype of result. Must be specified. | ||
api_version: str | None | ||
A string representing the version of the dataframe API specification | ||
in ``'YYYY.MM'`` form, for example, ``'2023.04'``. | ||
If it is ``None``, it should return an object corresponding to | ||
latest version of the dataframe API specification. If the given | ||
version is invalid or not implemented for the given module, an | ||
error should be raised. Default: ``None``. | ||
keys : list[str] | ||
Column names to consider. If `None`, all columns are considered. | ||
|
||
Returns | ||
------- | ||
Column | ||
Raises | ||
------ | ||
ValueError | ||
If any of the DataFrame's columns is not boolean. | ||
""" | ||
... | ||
|
||
def dataframe_from_dict(data: Mapping[str, Column[Any]], *, api_version: str | None = None) -> DataFrame: | ||
def all_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression: | ||
""" | ||
Construct DataFrame from map of column names to Columns. | ||
Reduction returns an Expression. | ||
|
||
Differs from ``DataFrame.all`` in that the reduction happens | ||
for each row, rather than for each column. | ||
|
||
Parameters | ||
---------- | ||
data : Mapping[str, Column] | ||
Column must be of the corresponding type of the DataFrame. | ||
For example, it is only supported to build a ``LibraryXDataFrame`` using | ||
``LibraryXColumn`` instances. | ||
api_version: str | None | ||
A string representing the version of the dataframe API specification | ||
in ``'YYYY.MM'`` form, for example, ``'2023.04'``. | ||
If it is ``None``, it should return an object corresponding to | ||
latest version of the dataframe API specification. If the given | ||
version is invalid or not implemented for the given module, an | ||
error should be raised. Default: ``None``. | ||
keys : list[str] | ||
Column names to consider. If `None`, all columns are considered. | ||
|
||
Returns | ||
------- | ||
DataFrame | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If any of the columns already has a name, and the corresponding key | ||
in `data` doesn't match. | ||
|
||
If any of the DataFrame's columns is not boolean. | ||
""" | ||
... | ||
|
||
def sorted_indices( | ||
keys: str | list[str] | None = None, | ||
*, | ||
ascending: Sequence[bool] | bool = True, | ||
nulls_position: Literal['first', 'last'] = 'last', | ||
) -> Expression: | ||
""" | ||
Return row numbers which would sort according to given columns. | ||
|
||
If you need to sort the DataFrame, use :meth:`DataFrame.sort`. | ||
|
||
def column_from_1d_array(array: Any, *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]: | ||
Parameters | ||
---------- | ||
keys : str | list[str], optional | ||
Names of columns to sort by. | ||
If `None`, sort by all columns. | ||
ascending : Sequence[bool] or bool | ||
If `True`, sort by all keys in ascending order. | ||
If `False`, sort by all keys in descending order. | ||
If a sequence, it must be the same length as `keys`, | ||
and determines the direction with which to use each | ||
key to sort by. | ||
nulls_position : ``{'first', 'last'}`` | ||
Whether null values should be placed at the beginning | ||
or at the end of the result. | ||
Note that the position of NaNs is unspecified and may | ||
vary based on the implementation. | ||
|
||
Returns | ||
------- | ||
Expression | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If `keys` and `ascending` are sequences of different lengths. | ||
""" | ||
Construct Column from 1D array. | ||
... | ||
|
||
See `dataframe_from_2d_array` for related 2D function. | ||
|
||
Only Array-API-compliant 1D arrays are supported. | ||
Cross-kind casting is undefined and may vary across implementations. | ||
Downcasting is disallowed. | ||
def unique_indices(keys: str | list[str] | None = None, *, skip_nulls: bool = True) -> Expression: | ||
""" | ||
Return indices corresponding to unique values across selected columns. | ||
|
||
Parameters | ||
---------- | ||
array : array | ||
array-API compliant 1D array | ||
name : str, optional | ||
Name to give columns. | ||
dtype : DType | ||
Dtype of column. | ||
api_version: str | None | ||
A string representing the version of the dataframe API specification | ||
in ``'YYYY.MM'`` form, for example, ``'2023.04'``. | ||
If it is ``None``, it should return an object corresponding to | ||
latest version of the dataframe API specification. If the given | ||
version is invalid or not implemented for the given module, an | ||
error should be raised. Default: ``None``. | ||
keys : str | list[str], optional | ||
Column names to consider when finding unique values. | ||
If `None`, all columns are considered. | ||
|
||
Returns | ||
------- | ||
Column | ||
Expression | ||
Indices corresponding to unique values. | ||
|
||
Notes | ||
----- | ||
There are no ordering guarantees. In particular, if there are multiple | ||
indices corresponding to the same unique value(s), there is no guarantee | ||
about which one will appear in the result. | ||
If the original column(s) contain multiple `'NaN'` values, then | ||
only a single index corresponding to those values will be returned. | ||
Likewise for null values (if ``skip_nulls=False``). | ||
To get the unique values, you can do ``df.get_rows(df.unique_indices(keys))``. | ||
""" | ||
... | ||
|
||
def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any], api_version: str | None = None) -> DataFrame: | ||
|
||
def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any]) -> DataFrame: | ||
""" | ||
Construct DataFrame from 2D array. | ||
|
||
See `column_from_1d_array` for related 1D function. | ||
|
||
Only Array-API-compliant 2D arrays are supported. | ||
Cross-kind casting is undefined and may vary across implementations. | ||
Downcasting is disallowed. | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we get rid of Column, then we can't initialise a column. So `column_from_sequence`, `column_from_1d_array`, and `dataframe_from_dict` would need to go, and the only initialiser left would be `dataframe_from_2d_array`.
Maybe we can consider adding others, but I think this is fine for now (and it is the only one which scikit-learn would probably need, if they're converting to ndarray and then converting back to dataframe).