1
1
from __future__ import annotations
2
2
3
+ import re
3
4
from typing import Any
5
+ from typing import Literal
4
6
from typing import TYPE_CHECKING
5
7
6
8
import pandas as pd
10
12
from dataframe_api_compat .pandas_standard .pandas_standard import PandasColumn
11
13
from dataframe_api_compat .pandas_standard .pandas_standard import PandasDataFrame
12
14
from dataframe_api_compat .pandas_standard .pandas_standard import PandasGroupBy
15
+ from dataframe_api_compat .pandas_standard .pandas_standard import PandasPermissiveColumn
16
+ from dataframe_api_compat .pandas_standard .pandas_standard import PandasPermissiveFrame
13
17
14
18
if TYPE_CHECKING :
15
19
from collections .abc import Sequence
20
+ from dataframe_api ._types import DType
21
+
22
+
23
def col(name: str) -> PandasColumn:
    """Return a ``PandasColumn`` that refers to the column called ``name``.

    ``name`` is used both as the single root name and as the output name.
    The column's base call selects ``name`` from the frame it is handed.
    """

    def select(df):
        # Label-based selection: every row of the requested column.
        return df.loc[:, name]

    return PandasColumn(root_names=[name], output_name=name, base_call=select)
27
+
16
28
17
29
Column = PandasColumn
30
+ PermissiveColumn = PandasPermissiveColumn
18
31
DataFrame = PandasDataFrame
32
+ PermissiveFrame = PandasPermissiveFrame
19
33
GroupBy = PandasGroupBy
20
34
21
35
@@ -67,35 +81,82 @@ class String:
67
81
...
68
82
69
83
70
- DTYPE_MAP = {
71
- "int64" : Int64 (),
72
- "Int64" : Int64 (),
73
- "int32" : Int32 (),
74
- "Int32" : Int32 (),
75
- "int16" : Int16 (),
76
- "Int16" : Int16 (),
77
- "int8" : Int8 (),
78
- "Int8" : Int8 (),
79
- "uint64" : UInt64 (),
80
- "UInt64" : UInt64 (),
81
- "uint32" : UInt32 (),
82
- "UInt32" : UInt32 (),
83
- "uint16" : UInt16 (),
84
- "UInt16" : UInt16 (),
85
- "uint8" : UInt8 (),
86
- "UInt8" : UInt8 (),
87
- "float64" : Float64 (),
88
- "Float64" : Float64 (),
89
- "float32" : Float32 (),
90
- "Float32" : Float32 (),
91
- "bool" : Bool (),
92
- "boolean" : Bool (),
93
- "object" : String (),
94
- "string" : String (),
95
- }
96
-
97
-
98
- def map_standard_dtype_to_pandas_dtype (dtype : Any ) -> Any :
84
class Date:
    """Marker class for the date dtype; it carries no state."""
86
+
87
+
88
class Datetime:
    """Datetime dtype described by a time unit and an optional time zone.

    Parameters
    ----------
    time_unit
        Unit string, stored unchanged on ``self.time_unit``.
    time_zone
        Time zone name, stored unchanged on ``self.time_zone``; ``None``
        (the default) when no zone is given. The value is not validated.
    """

    def __init__(self, time_unit: str, time_zone: str | None = None) -> None:
        self.time_unit = time_unit
        # todo validate time zone
        self.time_zone = time_zone

    def __repr__(self) -> str:
        # Keeps error messages that interpolate a dtype readable instead of
        # showing the default "<... object at 0x...>" form.
        return (
            f"{type(self).__name__}"
            f"(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})"
        )
93
+
94
+
95
class Duration:
    """Duration (timedelta) dtype described by its time unit.

    Parameters
    ----------
    time_unit
        Unit string, stored unchanged on ``self.time_unit``.
    """

    def __init__(self, time_unit: str) -> None:
        self.time_unit = time_unit

    def __repr__(self) -> str:
        # Keeps error messages that interpolate a dtype readable instead of
        # showing the default "<... object at 0x...>" form.
        return f"{type(self).__name__}(time_unit={self.time_unit!r})"
98
+
99
+
100
def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType:
    """Translate a pandas dtype name into the matching standard dtype object.

    Parameters
    ----------
    dtype
        The dtype name, or an object whose ``str()`` form is the dtype name.

    Returns
    -------
    DType
        A freshly created standard dtype instance. ``"datetime64[s]"`` maps to
        ``Date``; any other ``"datetime64[<unit>..."`` maps to ``Datetime``
        with that unit (no time zone is set on it), and
        ``"timedelta64[<unit>..."`` maps to ``Duration`` with that unit.

    Raises
    ------
    AssertionError
        If ``dtype`` is not one of the supported dtypes, including a
        datetime/timedelta name from which no unit can be extracted.
    """
    # Work on the string form so that dtype objects are handled the same way
    # as plain names, including in the prefix checks below.
    dtype_name = str(dtype)

    fixed_dtypes = {
        "int64": Int64,
        "Int64": Int64,
        "int32": Int32,
        "Int32": Int32,
        "int16": Int16,
        "Int16": Int16,
        "int8": Int8,
        "Int8": Int8,
        "uint64": UInt64,
        "UInt64": UInt64,
        "uint32": UInt32,
        "UInt32": UInt32,
        "uint16": UInt16,
        "UInt16": UInt16,
        "uint8": UInt8,
        "UInt8": UInt8,
        "float64": Float64,
        "Float64": Float64,
        "float32": Float32,
        "Float32": Float32,
        # 'boolean' not yet covered, as the default dtype in pandas is still 'bool'
        "bool": Bool,
        "object": String,
        "string": String,
        "datetime64[s]": Date,
    }
    factory = fixed_dtypes.get(dtype_name)
    if factory is not None:
        return factory()

    if dtype_name.startswith("datetime64["):
        match = re.search(r"datetime64\[(\w{1,2})", dtype_name)
        if match is not None:
            return Datetime(match.group(1))
    elif dtype_name.startswith("timedelta64["):
        match = re.search(r"timedelta64\[(\w{1,2})", dtype_name)
        if match is not None:
            return Duration(match.group(1))

    # Reached for unknown names and for datetime/timedelta names without a
    # parsable unit, so callers always see the same exception type.
    raise AssertionError(f"Unsupported dtype! {dtype}")
157
+
158
+
159
+ def map_standard_dtype_to_pandas_dtype (dtype : DType ) -> Any :
99
160
if isinstance (dtype , Int64 ):
100
161
return "int64"
101
162
if isinstance (dtype , Int32 ):
@@ -120,9 +181,26 @@ def map_standard_dtype_to_pandas_dtype(dtype: Any) -> Any:
120
181
return "bool"
121
182
if isinstance (dtype , String ):
122
183
return "object"
184
+ if isinstance (dtype , Datetime ):
185
+ if dtype .time_zone is not None : # pragma: no cover (todo)
186
+ return f"datetime64[{ dtype .time_unit } , { dtype .time_zone } ]"
187
+ return f"datetime64[{ dtype .time_unit } ]"
188
+ if isinstance (dtype , Duration ):
189
+ return f"timedelta64[{ dtype .time_unit } ]"
123
190
raise AssertionError (f"Unknown dtype: { dtype } " )
124
191
125
192
193
def convert_to_standard_compliant_column(
    ser: pd.Series, api_version: str | None = None
) -> PandasPermissiveColumn[Any]:
    """Wrap a pandas Series in a ``PandasPermissiveColumn``.

    Parameters
    ----------
    ser
        Series to wrap. Its name must be a string or ``None``.
    api_version
        API version passed to the wrapper; ``LATEST_API_VERSION`` is used
        when it is ``None``.

    Returns
    -------
    PandasPermissiveColumn
        Wrapper around ``ser`` renamed to its own name, or to ``""`` when the
        name is ``None`` or empty.

    Raises
    ------
    ValueError
        If the Series has a name that is not a string.
    """
    if api_version is None:  # pragma: no cover
        api_version = LATEST_API_VERSION
    if ser.name is not None and not isinstance(ser.name, str):
        raise ValueError(f"Expected column with string name, got: {ser.name}")
    # An unnamed Series is given the empty string as its name.
    name = ser.name or ""
    return PandasPermissiveColumn(ser.rename(name), api_version=api_version)
202
+
203
+
126
204
def convert_to_standard_compliant_dataframe (
127
205
df : pd .DataFrame , api_version : str | None = None
128
206
) -> PandasDataFrame :
@@ -131,13 +209,6 @@ def convert_to_standard_compliant_dataframe(
131
209
return PandasDataFrame (df , api_version = api_version )
132
210
133
211
134
- def convert_to_standard_compliant_column (
135
- df : pd .Series [Any ],
136
- api_version : str | None = None ,
137
- ) -> PandasColumn [Any ]:
138
- return PandasColumn (df , api_version = api_version or LATEST_API_VERSION )
139
-
140
-
141
212
def concat (dataframes : Sequence [PandasDataFrame ]) -> PandasDataFrame :
142
213
dtypes = dataframes [0 ].dataframe .dtypes
143
214
dfs = []
@@ -164,16 +235,30 @@ def concat(dataframes: Sequence[PandasDataFrame]) -> PandasDataFrame:
164
235
165
236
def column_from_sequence(
    sequence: Sequence[Any], *, dtype: Any, name: str, api_version: str | None = None
) -> PandasPermissiveColumn[Any]:
    """Build a ``PandasPermissiveColumn`` from a sequence of values.

    ``dtype`` is a standard dtype; it is converted with
    ``map_standard_dtype_to_pandas_dtype`` before the Series is created.
    ``api_version`` falls back to ``LATEST_API_VERSION`` when falsy.
    """
    pandas_dtype = map_standard_dtype_to_pandas_dtype(dtype)
    series = pd.Series(sequence, dtype=pandas_dtype, name=name)
    version = api_version or LATEST_API_VERSION
    return PandasPermissiveColumn(series, api_version=version)
241
+
242
+
243
def dataframe_from_dict(
    data: dict[str, PandasPermissiveColumn[Any]], api_version: str | None = None
) -> PandasDataFrame:
    """Assemble a ``PandasDataFrame`` from a mapping of label to column.

    Every value must be a ``PandasPermissiveColumn``, otherwise ``TypeError``
    is raised. Each underlying Series is renamed to its dictionary key before
    the frame is built. ``api_version`` falls back to ``LATEST_API_VERSION``
    when falsy.
    """
    for candidate in data.values():
        if not isinstance(candidate, PandasPermissiveColumn):  # pragma: no cover
            raise TypeError(f"Expected PandasPermissiveColumn, got {type(candidate)}")

    renamed = {}
    for label, column in data.items():
        renamed[label] = column.column.rename(label)

    frame = pd.DataFrame(renamed)
    return PandasDataFrame(frame, api_version=api_version or LATEST_API_VERSION)
170
255
171
256
172
257
def column_from_1d_array(
    data: Any, *, dtype: Any, name: str | None = None, api_version: str | None = None
) -> PandasPermissiveColumn[Any]:  # pragma: no cover
    """Build a ``PandasPermissiveColumn`` from array-like ``data``.

    ``dtype`` is a standard dtype; it is converted with
    ``map_standard_dtype_to_pandas_dtype`` before the Series is created.
    ``api_version`` falls back to ``LATEST_API_VERSION`` when falsy.
    """
    pandas_dtype = map_standard_dtype_to_pandas_dtype(dtype)
    series = pd.Series(data, dtype=pandas_dtype, name=name)
    version = api_version or LATEST_API_VERSION
    return PandasPermissiveColumn(series, api_version=version)
177
262
178
263
179
264
def dataframe_from_2d_array (
@@ -189,20 +274,6 @@ def dataframe_from_2d_array(
189
274
return PandasDataFrame (df , api_version = api_version or LATEST_API_VERSION )
190
275
191
276
192
- def dataframe_from_dict (
193
- data : dict [str , PandasColumn [Any ]], api_version : str | None = None
194
- ) -> PandasDataFrame :
195
- for _ , col in data .items ():
196
- if not isinstance (col , PandasColumn ): # pragma: no cover
197
- raise TypeError (f"Expected PandasColumn, got { type (col )} " )
198
- return PandasDataFrame (
199
- pd .DataFrame (
200
- {label : column .column .rename (label ) for label , column in data .items ()}
201
- ),
202
- api_version = api_version or LATEST_API_VERSION ,
203
- )
204
-
205
-
206
277
def is_null (value : Any ) -> bool :
207
278
return value is null
208
279
@@ -223,3 +294,47 @@ def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool:
223
294
if _kind == "string" :
224
295
dtypes .add (String )
225
296
return isinstance (dtype , tuple (dtypes ))
297
+
298
+
299
def any_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn:
    """Return a ``PandasColumn`` (output name ``"any"``) reducing rows with ``any``.

    The reduction runs over the named ``columns``, or over every column of
    the frame when none are given. ``skip_nulls`` is accepted but not used by
    this implementation.
    """
    # todo: accept expressions

    def reduce_rows(df):
        selected = list(columns)
        if not selected:
            # No explicit columns: fall back to all columns of the frame.
            selected = df.columns.tolist()
        return df.loc[:, selected].any(axis=1)

    return PandasColumn(
        root_names=list(columns), output_name="any", base_call=reduce_rows
    )
305
+
306
+
307
def all_rowwise(*columns: str, skip_nulls: bool = True) -> PandasColumn:
    """Return a ``PandasColumn`` (output name ``"all"``) reducing rows with ``all``.

    The reduction runs over the named ``columns``, or over every column of
    the frame when none are given. ``skip_nulls`` is accepted but not used by
    this implementation.
    """

    def reduce_rows(df: pd.DataFrame) -> pd.Series:
        selected = list(columns)
        if not selected:
            # No explicit columns: fall back to all columns of the frame.
            selected = df.columns.tolist()
        return df.loc[:, selected].all(axis=1)

    return PandasColumn(
        root_names=list(columns), output_name="all", base_call=reduce_rows
    )
312
+
313
+
314
def sorted_indices(
    *keys: str,
    ascending: Sequence[bool] | bool = True,
    nulls_position: Literal["first", "last"] = "last",
) -> Column:
    """Return a ``PandasColumn`` (output name ``"indices"``) of sorting indices.

    Parameters
    ----------
    *keys
        Names of the columns to sort by.
    ascending
        A single flag, or one flag per key. A single truthy flag sorts
        ascending; a single falsy flag sorts ascending and then reverses the
        result. A list-like value is forwarded to ``DataFrame.sort_values``
        as its per-key ``ascending`` argument.
    nulls_position
        Accepted but not consulted by this implementation.

    Returns
    -------
    Column
        Column whose base call yields the frame's index labels in sorted
        order, re-indexed from zero.
    """

    def func(df: pd.DataFrame) -> pd.Series:
        by = list(keys)
        frame = df.loc[:, by]
        if pd.api.types.is_list_like(ascending):
            # A non-empty sequence is always truthy, so testing it with
            # ``if ascending`` would ignore its contents; hand the per-key
            # flags to pandas instead.
            order = frame.sort_values(by, ascending=list(ascending)).index.to_series()
        else:
            order = frame.sort_values(by).index.to_series()
            if not ascending:
                # Single-flag descending: reverse of the ascending order.
                order = order[::-1]
        return order.reset_index(drop=True)

    return PandasColumn(root_names=list(keys), output_name="indices", base_call=func)
335
+
336
+
337
def unique_indices(
    keys: str | list[str] | None = None, *, skip_nulls: bool = True
) -> Column:
    """Placeholder: always raises ``NotImplementedError`` for the pandas backend."""
    message = "namespace.unique_indices not implemented for pandas yet"
    raise NotImplementedError(message)
0 commit comments