diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 07876344..f5c17d80 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -92,7 +92,7 @@ def group_by(self, *keys: str) -> GroupBy: """ ... - def get_column_by_name(self, name: str, /) -> Column: + def col(self, name: str, /) -> Column: """ Select a column by name. @@ -195,7 +195,7 @@ def assign(self, *columns: Column) -> Self: .. code-block:: python - new_column = df.get_column_by_name('a') + 1 + new_column = df.col('a') + 1 df = df.assign(new_column.rename('b')) Parameters diff --git a/spec/API_specification/dataframe_api/py.typed b/spec/API_specification/dataframe_api/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/spec/API_specification/examples/01_standardise_columns.py b/spec/API_specification/examples/01_standardise_columns.py index 476a4b10..e7b9d78e 100644 --- a/spec/API_specification/examples/01_standardise_columns.py +++ b/spec/API_specification/examples/01_standardise_columns.py @@ -11,7 +11,7 @@ def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any for column_name in df.column_names: if column_name == 'species': continue - new_column = df.get_column_by_name(column_name) + new_column = df.col(column_name) new_column = (new_column - new_column.mean()) / new_column.std() df = df.assign(new_column.rename(f'{column_name}_scaled')) diff --git a/spec/API_specification/examples/02_plotting.py b/spec/API_specification/examples/02_plotting.py index 7a8dd701..f29985ea 100644 --- a/spec/API_specification/examples/02_plotting.py +++ b/spec/API_specification/examples/02_plotting.py @@ -25,7 +25,7 @@ def group_by_and_plot( ) agg = df.group_by("color").mean().fill_null(float('nan')) - x = agg.get_column_by_name("x").to_array() - y = agg.get_column_by_name("y").to_array() + x = agg.col("x").to_array() + y = agg.col("y").to_array() my_plotting_function(x, y) diff --git a/spec/API_specification/examples/tpch/q1.py b/spec/API_specification/examples/tpch/q1.py index b5c11287..21a1f4a5 100644 --- a/spec/API_specification/examples/tpch/q1.py +++ b/spec/API_specification/examples/tpch/q1.py @@ -8,16 +8,16 @@ def query(lineitem_raw: SupportsDataFrameAPI) -> Any: lineitem = lineitem_raw.__dataframe_consortium_standard__() namespace = lineitem.__dataframe_namespace__() - mask = lineitem.get_column_by_name("l_shipdate") <= namespace.date(1998, 9, 2) + mask = lineitem.col("l_shipdate") <= namespace.date(1998, 9, 2) lineitem = lineitem.assign( ( - lineitem.get_column_by_name("l_extended_price") - * (1 - lineitem.get_column_by_name("l_discount")) + lineitem.col("l_extended_price") + * (1 - lineitem.col("l_discount")) ).rename("l_disc_price"), ( - lineitem.get_column_by_name("l_extended_price") - * (1 - lineitem.get_column_by_name("l_discount")) - * (1 + lineitem.get_column_by_name("l_tax")) + lineitem.col("l_extended_price") + * (1 - lineitem.col("l_discount")) + * (1 + lineitem.col("l_tax")) ).rename("l_charge"), ) result = ( diff --git a/spec/API_specification/examples/tpch/q5.py b/spec/API_specification/examples/tpch/q5.py index 332967c7..15902f8a 100644 --- a/spec/API_specification/examples/tpch/q5.py +++ b/spec/API_specification/examples/tpch/q5.py @@ -53,19 +53,15 @@ def query( ) ) mask = ( - ( - result.get_column_by_name("c_nationkey") - == result.get_column_by_name("s_nationkey") - ) - & (result.get_column_by_name("r_name") == "ASIA") - & (result.get_column_by_name("o_orderdate") >= namespace.date(1994, 1, 1)) - & (result.get_column_by_name("o_orderdate") < namespace.date(1995, 1, 1)) + (result.col("c_nationkey") == result.col("s_nationkey")) + & (result.col("r_name") == "ASIA") + & (result.col("o_orderdate") >= namespace.date(1994, 1, 1)) + & (result.col("o_orderdate") < namespace.date(1995, 1, 1)) ) result = result.filter(mask) new_column = ( - result.get_column_by_name("l_extendedprice") - * (1 - result.get_column_by_name("l_discount")) + result.col("l_extendedprice") * (1 - result.col("l_discount")) ).rename("revenue") result = result.assign(new_column) result = result.group_by("n_name").aggregate(namespace.Aggregation.sum("revenue")) diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 93de5c53..c85812eb 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -14,14 +14,14 @@ the `float` it is documented to return, in combination with the `__gt__` method class DataFrame: def __gt__(self, other: DataFrame | Scalar) -> DataFrame: ... - def get_column_by_name(self, name: str, /) -> Column: + def col(self, name: str, /) -> Column: ... class Column: def mean(self, skip_nulls: bool = True) -> float | NullType: ... -larger = df2 > df1.get_column_by_name('foo').mean() +larger = df2 > df1.col('foo').mean() ``` For a GPU dataframe library, it is desirable for all data to reside on the GPU,