Skip to content

Commit afa19ab

Browse files
authored
Merge pull request #13 from data-apis/remove-column-add-expression
Separate lazy and eager
2 parents b27faa2 + 801cbf1 commit afa19ab

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+3189
-1682
lines changed

Diff for: .github/workflows/tox.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
tox:
1010
strategy:
1111
matrix:
12-
python-version: ["3.8", "3.9", "3.10", "3.11"]
12+
python-version: ["3.9", "3.10", "3.11"]
1313
os: [windows-latest, ubuntu-latest]
1414

1515
runs-on: ${{ matrix.os }}

Diff for: dataframe_api_compat/pandas_standard/__init__.py

+169-54
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
import re
34
from typing import Any
5+
from typing import Literal
46
from typing import TYPE_CHECKING
57

68
import pandas as pd
@@ -10,12 +12,24 @@
1012
from dataframe_api_compat.pandas_standard.pandas_standard import PandasColumn
1113
from dataframe_api_compat.pandas_standard.pandas_standard import PandasDataFrame
1214
from dataframe_api_compat.pandas_standard.pandas_standard import PandasGroupBy
15+
from dataframe_api_compat.pandas_standard.pandas_standard import PandasPermissiveColumn
16+
from dataframe_api_compat.pandas_standard.pandas_standard import PandasPermissiveFrame
1317

1418
if TYPE_CHECKING:
1519
from collections.abc import Sequence
20+
from dataframe_api._types import DType
21+
22+
23+
def col(name: str) -> PandasColumn:
    """Build a column expression that refers to the column called ``name``.

    The resulting ``PandasColumn`` has ``name`` as its only root name and as
    its output name. Its base call, when handed a pandas DataFrame, selects
    that column by label.
    """

    def _select(df):
        # Label-based lookup of the single requested column.
        return df.loc[:, name]

    return PandasColumn(root_names=[name], output_name=name, base_call=_select)
27+
1628

1729
Column = PandasColumn
30+
PermissiveColumn = PandasPermissiveColumn
1831
DataFrame = PandasDataFrame
32+
PermissiveFrame = PandasPermissiveFrame
1933
GroupBy = PandasGroupBy
2034

2135

@@ -67,35 +81,82 @@ class String:
6781
...
6882

6983

70-
DTYPE_MAP = {
71-
"int64": Int64(),
72-
"Int64": Int64(),
73-
"int32": Int32(),
74-
"Int32": Int32(),
75-
"int16": Int16(),
76-
"Int16": Int16(),
77-
"int8": Int8(),
78-
"Int8": Int8(),
79-
"uint64": UInt64(),
80-
"UInt64": UInt64(),
81-
"uint32": UInt32(),
82-
"UInt32": UInt32(),
83-
"uint16": UInt16(),
84-
"UInt16": UInt16(),
85-
"uint8": UInt8(),
86-
"UInt8": UInt8(),
87-
"float64": Float64(),
88-
"Float64": Float64(),
89-
"float32": Float32(),
90-
"Float32": Float32(),
91-
"bool": Bool(),
92-
"boolean": Bool(),
93-
"object": String(),
94-
"string": String(),
95-
}
96-
97-
98-
def map_standard_dtype_to_pandas_dtype(dtype: Any) -> Any:
84+
class Date:
    """Marker dtype class for dates; it takes no parameters and holds no state."""
86+
87+
88+
class Datetime:
    """Datetime dtype described by a time unit and an optional time zone.

    Parameters
    ----------
    time_unit
        Resolution of the values. Stored as given; elsewhere in this module
        it is interpolated into pandas dtype strings of the form
        ``datetime64[<time_unit>]``.
    time_zone
        Time zone name, or ``None`` (the default) when there is none.
        The value is stored without any validation.
    """

    def __init__(self, time_unit: str, time_zone: str | None = None) -> None:
        self.time_unit = time_unit
        # TODO: validate time zone
        self.time_zone = time_zone
93+
94+
95+
class Duration:
    """Duration (timedelta) dtype described by its time unit.

    Parameters
    ----------
    time_unit
        Resolution of the values. Stored as given; elsewhere in this module
        it is interpolated into pandas dtype strings of the form
        ``timedelta64[<time_unit>]``.
    """

    def __init__(self, time_unit: str) -> None:
        self.time_unit = time_unit
98+
99+
100+
def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType:
    """Translate a pandas dtype (or its string name) into a standard dtype.

    Parameters
    ----------
    dtype
        A pandas dtype name such as ``"int64"`` or ``"datetime64[ns]"``, or
        any object that compares equal to / stringifies as such a name.

    Returns
    -------
    DType
        A fresh instance of the matching dtype class. Numpy-style and
        nullable spellings (``"int64"`` / ``"Int64"``) map to the same class.
        ``"datetime64[s]"`` maps to ``Date``; every other ``datetime64[...]``
        maps to ``Datetime`` (keeping the time zone when the name carries
        one) and ``timedelta64[...]`` maps to ``Duration``.

    Raises
    ------
    AssertionError
        If ``dtype`` matches none of the supported names.
    """
    # Matched with ``==`` in declaration order (not via a dict lookup) so that
    # inputs which merely compare equal to these names keep matching.
    exact_matches = (
        ("int64", Int64),
        ("Int64", Int64),
        ("int32", Int32),
        ("Int32", Int32),
        ("int16", Int16),
        ("Int16", Int16),
        ("int8", Int8),
        ("Int8", Int8),
        ("uint64", UInt64),
        ("UInt64", UInt64),
        ("uint32", UInt32),
        ("UInt32", UInt32),
        ("uint16", UInt16),
        ("UInt16", UInt16),
        ("uint8", UInt8),
        ("UInt8", UInt8),
        ("float64", Float64),
        ("Float64", Float64),
        ("float32", Float32),
        ("Float32", Float32),
        # 'boolean' not yet covered, as the default dtype in pandas is still 'bool'
        ("bool", Bool),
        ("object", String),
        ("string", String),
        # Must precede the generic datetime64[...] handling below.
        ("datetime64[s]", Date),
    )
    for pandas_name, standard_cls in exact_matches:
        if dtype == pandas_name:
            return standard_cls()

    # Work on the string form so dtype objects do not fail on str-only methods.
    dtype_name = str(dtype)

    # "datetime64[<unit>]" or "datetime64[<unit>, <tz>]"; group 2 is None
    # when no time zone is present.
    datetime_match = re.match(r"datetime64\[(\w{1,2})(?:,\s*([^\]]+))?", dtype_name)
    if datetime_match is not None:
        return Datetime(datetime_match.group(1), datetime_match.group(2))

    timedelta_match = re.match(r"timedelta64\[(\w{1,2})", dtype_name)
    if timedelta_match is not None:
        return Duration(timedelta_match.group(1))

    raise AssertionError(f"Unsupported dtype! {dtype}")
157+
158+
159+
def map_standard_dtype_to_pandas_dtype(dtype: DType) -> Any:
99160
if isinstance(dtype, Int64):
100161
return "int64"
101162
if isinstance(dtype, Int32):
@@ -120,9 +181,26 @@ def map_standard_dtype_to_pandas_dtype(dtype: Any) -> Any:
120181
return "bool"
121182
if isinstance(dtype, String):
122183
return "object"
184+
if isinstance(dtype, Datetime):
185+
if dtype.time_zone is not None: # pragma: no cover (todo)
186+
return f"datetime64[{dtype.time_unit}, {dtype.time_zone}]"
187+
return f"datetime64[{dtype.time_unit}]"
188+
if isinstance(dtype, Duration):
189+
return f"timedelta64[{dtype.time_unit}]"
123190
raise AssertionError(f"Unknown dtype: {dtype}")
124191

125192

193+
def convert_to_standard_compliant_column(
    ser: pd.Series, api_version: str | None = None
) -> PandasPermissiveColumn[Any]:
    """Wrap a pandas Series in a ``PandasPermissiveColumn``.

    Parameters
    ----------
    ser
        Series to wrap. Its name must be a string or ``None``.
    api_version
        API version passed on to the wrapper; ``LATEST_API_VERSION`` is used
        when this is ``None``.

    Returns
    -------
    PandasPermissiveColumn
        Wrapper around ``ser`` renamed to its own name, or to ``""`` when the
        series has no name (or an empty one).

    Raises
    ------
    ValueError
        If the series has a name that is not a string.
    """
    if api_version is None:  # pragma: no cover
        api_version = LATEST_API_VERSION
    if ser.name is not None and not isinstance(ser.name, str):
        raise ValueError(f"Expected column with string name, got: {ser.name}")
    # A missing name (None) is normalised to the empty string.
    name = ser.name or ""
    return PandasPermissiveColumn(ser.rename(name), api_version=api_version)
202+
203+
126204
def convert_to_standard_compliant_dataframe(
127205
df: pd.DataFrame, api_version: str | None = None
128206
) -> PandasDataFrame:
@@ -131,13 +209,6 @@ def convert_to_standard_compliant_dataframe(
131209
return PandasDataFrame(df, api_version=api_version)
132210

133211

134-
def convert_to_standard_compliant_column(
135-
df: pd.Series[Any],
136-
api_version: str | None = None,
137-
) -> PandasColumn[Any]:
138-
return PandasColumn(df, api_version=api_version or LATEST_API_VERSION)
139-
140-
141212
def concat(dataframes: Sequence[PandasDataFrame]) -> PandasDataFrame:
142213
dtypes = dataframes[0].dataframe.dtypes
143214
dfs = []
@@ -164,16 +235,30 @@ def concat(dataframes: Sequence[PandasDataFrame]) -> PandasDataFrame:
164235

165236
def column_from_sequence(
166237
sequence: Sequence[Any], *, dtype: Any, name: str, api_version: str | None = None
167-
) -> PandasColumn[Any]:
238+
) -> PandasPermissiveColumn[Any]:
168239
ser = pd.Series(sequence, dtype=map_standard_dtype_to_pandas_dtype(dtype), name=name)
169-
return PandasColumn(ser, api_version=LATEST_API_VERSION)
240+
return PandasPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION)
241+
242+
243+
def dataframe_from_dict(
    data: dict[str, PandasPermissiveColumn[Any]], api_version: str | None = None
) -> PandasDataFrame:
    """Assemble a ``PandasDataFrame`` from a mapping of labels to columns.

    Every value must be a ``PandasPermissiveColumn``, otherwise ``TypeError``
    is raised. Each wrapped series is renamed to its dictionary key before
    the pandas DataFrame is built. ``api_version`` falls back to
    ``LATEST_API_VERSION`` when it is falsy.
    """
    for value in data.values():
        if not isinstance(value, PandasPermissiveColumn):  # pragma: no cover
            raise TypeError(f"Expected PandasPermissiveColumn, got {type(value)}")

    # The dictionary key, not the series' current name, decides the label.
    renamed = {}
    for label, column in data.items():
        renamed[label] = column.column.rename(label)

    return PandasDataFrame(
        pd.DataFrame(renamed),
        api_version=api_version or LATEST_API_VERSION,
    )
170255

171256

172257
def column_from_1d_array(
173258
data: Any, *, dtype: Any, name: str | None = None, api_version: str | None = None
174-
) -> PandasColumn[Any]: # pragma: no cover
259+
) -> PandasPermissiveColumn[Any]: # pragma: no cover
175260
ser = pd.Series(data, dtype=map_standard_dtype_to_pandas_dtype(dtype), name=name)
176-
return PandasColumn(ser, api_version=api_version or LATEST_API_VERSION)
261+
return PandasPermissiveColumn(ser, api_version=api_version or LATEST_API_VERSION)
177262

178263

179264
def dataframe_from_2d_array(
@@ -189,20 +274,6 @@ def dataframe_from_2d_array(
189274
return PandasDataFrame(df, api_version=api_version or LATEST_API_VERSION)
190275

191276

192-
def dataframe_from_dict(
193-
data: dict[str, PandasColumn[Any]], api_version: str | None = None
194-
) -> PandasDataFrame:
195-
for _, col in data.items():
196-
if not isinstance(col, PandasColumn): # pragma: no cover
197-
raise TypeError(f"Expected PandasColumn, got {type(col)}")
198-
return PandasDataFrame(
199-
pd.DataFrame(
200-
{label: column.column.rename(label) for label, column in data.items()}
201-
),
202-
api_version=api_version or LATEST_API_VERSION,
203-
)
204-
205-
206277
def is_null(value: Any) -> bool:
    """Tell whether ``value`` is the module's ``null`` object (identity check)."""
    return value is null
208279

@@ -223,3 +294,47 @@ def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool:
223294
if _kind == "string":
224295
dtypes.add(String)
225296
return isinstance(dtype, tuple(dtypes))
297+
298+
299+
def any_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn:
    """Build a column expression that is the row-wise ``any`` of ``columns``.

    Parameters
    ----------
    *columns
        Names of the columns to reduce. When none are given, every column of
        the DataFrame the expression is evaluated against is used.
    skip_nulls
        Accepted for signature compatibility; this implementation does not
        consult it.

    Returns
    -------
    PandasColumn
        Expression named ``"any"`` whose root names are ``columns``.
    """

    # todo: accept expressions
    def func(df: pd.DataFrame) -> pd.Series:
        # An empty selection means "all columns of df".
        return df.loc[:, list(columns) or df.columns.tolist()].any(axis=1)

    return PandasColumn(root_names=list(columns), output_name="any", base_call=func)
305+
306+
307+
def all_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn:
    """Build a column expression that is the row-wise ``all`` of ``columns``.

    With no column names given, the reduction runs over every column of the
    DataFrame the expression is evaluated against. ``skip_nulls`` is accepted
    but not consulted here. The expression's output name is ``"all"``.
    """

    def _reduce(df: pd.DataFrame) -> pd.Series:
        selected = list(columns)
        if not selected:
            # No explicit columns: fall back to all columns of df.
            selected = df.columns.tolist()
        subset = df.loc[:, selected]
        return subset.all(axis=1)

    return PandasColumn(
        root_names=list(columns),
        output_name="all",
        base_call=_reduce,
    )
312+
313+
314+
def sorted_indices(
    *keys: str,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Build a column expression giving the row order that sorts by ``keys``.

    Parameters
    ----------
    *keys
        Names of the columns to sort by, in priority order.
    ascending
        Either one flag applied to every key, or one flag per key.
    nulls_position
        Whether null values are placed ``"first"`` or ``"last"``.

    Returns
    -------
    Column
        Expression named ``"indices"``. Its base call sorts the key columns
        of the DataFrame it is given and returns that frame's index labels
        in sorted order, as a Series with a fresh 0..n-1 index.
    """
    # pandas takes a bool or a list of bools; materialise per-key flags into a
    # list and collapse any scalar flag to a plain bool.
    if hasattr(ascending, "__iter__"):
        sort_ascending = list(ascending)
    else:
        sort_ascending = bool(ascending)

    def func(df: pd.DataFrame) -> pd.Series:
        by = list(keys)
        # Direction and null placement are delegated to pandas, so per-key
        # flags and ``nulls_position`` are honoured instead of reversing an
        # ascending sort.
        return (
            df.loc[:, by]
            .sort_values(by, ascending=sort_ascending, na_position=nulls_position)
            .index.to_series()
            .reset_index(drop=True)
        )

    return PandasColumn(root_names=list(keys), output_name="indices", base_call=func)
335+
336+
337+
def unique_indices(
    keys: str | list[str] | None = None, *, skip_nulls: bool = True
) -> Column:
    """Placeholder for ``unique_indices``, which pandas does not support yet.

    Raises
    ------
    NotImplementedError
        Always, whatever arguments are passed.
    """
    message = "namespace.unique_indices not implemented for pandas yet"
    raise NotImplementedError(message)

0 commit comments

Comments
 (0)