# Per-dtype sets of groupby method names that ``GroupByMethods.setup`` skips
# (it raises NotImplementedError when ``method`` is in the set for ``dtype``).
# Dtypes without an entry have nothing blacklisted.
method_blacklist = {
    'object': {
        'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'mad', 'max',
        'mean', 'median', 'min', 'pct_change', 'prod', 'rank', 'sem', 'skew',
        'std', 'sum', 'var',
    },
    'datetime': {
        'cummax', 'cumprod', 'cumsum', 'describe', 'mad', 'mean', 'median',
        'pct_change', 'prod', 'sem', 'skew', 'std', 'sum', 'var',
    },
}
1922
2023
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
9093 self .ser .groupby (self .ser ).groups
9194
9295
93- class FirstLast (object ):
94-
95- goal_time = 0.2
96-
97- param_names = ['dtype' ]
98- params = ['float32' , 'float64' , 'datetime' , 'object' ]
99-
100- def setup (self , dtype ):
101- N = 10 ** 5
102- # with datetimes (GH7555)
103- if dtype == 'datetime' :
104- self .df = DataFrame ({'values' : date_range ('1/1/2011' ,
105- periods = N ,
106- freq = 's' ),
107- 'key' : range (N )})
108- elif dtype == 'object' :
109- self .df = DataFrame ({'values' : ['foo' ] * N ,
110- 'key' : range (N )})
111- else :
112- labels = np .arange (N / 10 ).repeat (10 )
113- data = Series (np .random .randn (len (labels )), dtype = dtype )
114- data [::3 ] = np .nan
115- data [1 ::3 ] = np .nan
116- labels = labels .take (np .random .permutation (len (labels )))
117- self .df = DataFrame ({'values' : data , 'key' : labels })
118-
119- def time_groupby_first (self , dtype ):
120- self .df .groupby ('key' ).first ()
121-
122- def time_groupby_last (self , dtype ):
123- self .df .groupby ('key' ).last ()
124-
125- def time_groupby_nth_all (self , dtype ):
126- self .df .groupby ('key' ).nth (0 , dropna = 'all' )
127-
128- def time_groupby_nth_none (self , dtype ):
129- self .df .groupby ('key' ).nth (0 )
130-
131-
13296class GroupManyLabels (object ):
13397
13498 goal_time = 0.2
class Nth(object):
    """Benchmarks for ``GroupBy.nth`` on a DataFrame and on a Series.

    Parametrized over ``dtype`` of the 'values' column: 'float32',
    'float64', 'datetime' or 'object'.
    """

    goal_time = 0.2

    param_names = ['dtype']
    params = ['float32', 'float64', 'datetime', 'object']

    def setup(self, dtype):
        """Build ``self.df`` with columns 'key' and 'values' for ``dtype``.

        'key' is ``np.arange(N)``, so every row forms its own group.
        """
        N = 10 ** 5
        # with datetimes (GH7555)
        if dtype == 'datetime':
            values = date_range('1/1/2011', periods=N, freq='s')
        elif dtype == 'object':
            values = ['foo'] * N
        else:
            values = np.arange(N).astype(dtype)

        key = np.arange(N)
        self.df = DataFrame({'key': key, 'values': values})
        self.df.iloc[1, 1] = np.nan  # insert missing data

    # -- DataFrame groupby ---------------------------------------------------

    def time_frame_nth_any(self, dtype):
        self.df.groupby('key').nth(0, dropna='any')

    def time_groupby_nth_all(self, dtype):
        self.df.groupby('key').nth(0, dropna='all')

    def time_frame_nth(self, dtype):
        self.df.groupby('key').nth(0)

    # -- Series groupby ------------------------------------------------------

    def time_series_nth_any(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')

    # Previously also named ``time_groupby_nth_all``, which replaced the
    # DataFrame benchmark of that name in the class namespace; a distinct
    # name keeps both benchmarks discoverable.
    def time_series_nth_all(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')

    def time_series_nth(self, dtype):
        self.df['values'].groupby(self.df['key']).nth(0)
185150
186151
187152class DateAttributes (object ):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
243208 df .groupby (['key1' , 'key2' ]).count ()
244209
245210
246- class CountInt (object ):
211+ class CountMultiInt (object ):
247212
248213 goal_time = 0.2
249214
@@ -255,18 +220,18 @@ def setup_cache(self):
255220 'ints2' : np .random .randint (0 , 1000 , size = n )})
256221 return df
257222
258- def time_int_count (self , df ):
223+ def time_multi_int_count (self , df ):
259224 df .groupby (['key1' , 'key2' ]).count ()
260225
261- def time_int_nunique (self , df ):
226+ def time_multi_int_nunique (self , df ):
262227 df .groupby (['key1' , 'key2' ]).nunique ()
263228
264229
265230class AggFunctions (object ):
266231
267232 goal_time = 0.2
268233
269- def setup_cache (self ):
234+ def setup_cache ():
270235 N = 10 ** 5
271236 fac1 = np .array (['A' , 'B' , 'C' ], dtype = 'O' )
272237 fac2 = np .array (['one' , 'two' ], dtype = 'O' )
@@ -361,9 +326,6 @@ def setup(self):
361326 def time_multi_size (self ):
362327 self .df .groupby (['key1' , 'key2' ]).size ()
363328
364- def time_dt_size (self ):
365- self .df .groupby (['dates' ]).size ()
366-
367329 def time_dt_timegrouper_size (self ):
368330 with warnings .catch_warnings (record = True ):
369331 self .df .groupby (TimeGrouper (key = 'dates' , freq = 'M' )).size ()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
376338
377339 goal_time = 0.2
378340
379- param_names = ['dtype' , 'method' ]
380- params = [['int' , 'float' , 'object' ],
341+ param_names = ['dtype' , 'method' , 'application' ]
342+ params = [['int' , 'float' , 'object' , 'datetime' ],
381343 ['all' , 'any' , 'bfill' , 'count' , 'cumcount' , 'cummax' , 'cummin' ,
382344 'cumprod' , 'cumsum' , 'describe' , 'ffill' , 'first' , 'head' ,
383345 'last' , 'mad' , 'max' , 'min' , 'median' , 'mean' , 'nunique' ,
384346 'pct_change' , 'prod' , 'rank' , 'sem' , 'shift' , 'size' , 'skew' ,
385- 'std' , 'sum' , 'tail' , 'unique' , 'value_counts' , 'var' ]]
347+ 'std' , 'sum' , 'tail' , 'unique' , 'value_counts' , 'var' ],
348+ ['direct' , 'transformation' ]]
386349
387- def setup (self , dtype , method ):
350+ def setup (self , dtype , method , application ):
388351 if method in method_blacklist .get (dtype , {}):
389352 raise NotImplementedError # skip benchmark
390353 ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
398361 np .random .random (ngroups ) * 10.0 ])
399362 elif dtype == 'object' :
400363 key = ['foo' ] * size
364+ elif dtype == 'datetime' :
365+ key = date_range ('1/1/2011' , periods = size , freq = 's' )
401366
402367 df = DataFrame ({'values' : values , 'key' : key })
403- self .df_groupby_method = getattr (df .groupby ('key' )['values' ], method )
404368
405- def time_method (self , dtype , method ):
406- self .df_groupby_method ()
369+ if application == 'transform' :
370+ if method == 'describe' :
371+ raise NotImplementedError
372+
373+ self .as_group_method = lambda : df .groupby (
374+ 'key' )['values' ].transform (method )
375+ self .as_field_method = lambda : df .groupby (
376+ 'values' )['key' ].transform (method )
377+ else :
378+ self .as_group_method = getattr (df .groupby ('key' )['values' ], method )
379+ self .as_field_method = getattr (df .groupby ('values' )['key' ], method )
380+
381+ def time_dtype_as_group (self , dtype , method , application ):
382+ self .as_group_method ()
383+
384+ def time_dtype_as_field (self , dtype , method , application ):
385+ self .as_field_method ()
407386
408387
409388class Float32 (object ):
0 commit comments