@@ -39,56 +39,74 @@ def df():
3939 )
4040 return ctx .create_dataframe ([[batch ]])
4141
@pytest.fixture
def df_aggregate_100():
    """Return the 100-row aggregate test table loaded from the testing CSV."""
    ctx = SessionContext()
    # Register the shared CSV fixture file under a well-known table name,
    # then hand the caller a DataFrame view of that table.
    ctx.register_csv(
        "aggregate_test_data",
        "./testing/data/csv/aggregate_test_100.csv",
    )
    return ctx.table("aggregate_test_data")
4849
@pytest.mark.parametrize(
    "agg_expr, calc_expected",
    [
        (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
        (
            f.corr(column("a"), column("b")),
            lambda a, b, c, d: np.array(np.corrcoef(a, b)[0][1]),
        ),
        (f.count(column("a")), lambda a, b, c, d: pa.array([len(a)])),
        # Sample (co)variance -> ddof=1
        # Population (co)variance -> ddof=0
        (
            f.covar(column("a"), column("b")),
            lambda a, b, c, d: np.array(np.cov(a, b, ddof=1)[0][1]),
        ),
        (
            f.covar_pop(column("a"), column("c")),
            lambda a, b, c, d: np.array(np.cov(a, c, ddof=0)[0][1]),
        ),
        (
            f.covar_samp(column("b"), column("c")),
            lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]),
        ),
        # f.grouping(col_a), # No physical plan implemented yet
        (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))),
        (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))),
        (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))),
        (f.min(column("a")), lambda a, b, c, d: np.array(np.min(a))),
        (f.sum(column("b")), lambda a, b, c, d: np.array(np.sum(b.to_pylist()))),
        # Sample stdev -> ddof=1
        # Population stdev -> ddof=0
        (f.stddev(column("a")), lambda a, b, c, d: np.array(np.std(a, ddof=1))),
        (f.stddev_pop(column("b")), lambda a, b, c, d: np.array(np.std(b, ddof=0))),
        (f.stddev_samp(column("c")), lambda a, b, c, d: np.array(np.std(c, ddof=1))),
        (f.var(column("a")), lambda a, b, c, d: np.array(np.var(a, ddof=1))),
        (f.var_pop(column("b")), lambda a, b, c, d: np.array(np.var(b, ddof=0))),
        (f.var_samp(column("c")), lambda a, b, c, d: np.array(np.var(c, ddof=1))),
    ],
)
def test_aggregation_stats(df, agg_expr, calc_expected):
    """Check each statistical aggregate against the NumPy-computed reference.

    The `calc_expected` callable receives the four source columns (a, b, c, d)
    of the `df` fixture and returns the value the aggregate should produce.
    """
    aggregated = df.aggregate([], [agg_expr])
    batch = aggregated.collect()[0]
    # Pull the raw input columns back out of the fixture to feed the reference
    # computation; the fixture materialises as a single record batch.
    values_a, values_b, values_c, values_d = df.collect()[0]
    expected = calc_expected(values_a, values_b, values_c, values_d)
    np.testing.assert_array_almost_equal(batch.column(0), expected)
8196
82- @pytest .mark .parametrize ("agg_expr, expected" , [
83- (f .approx_distinct (column ("b" )), pa .array ([2 ], type = pa .uint64 ())),
84- (f .approx_median (column ("b" )), pa .array ([4 ])),
85- (f .approx_percentile_cont (column ("b" ), lit (0.5 )), pa .array ([4 ])),
86- (
87- f .approx_percentile_cont_with_weight (column ("b" ), lit (0.6 ), lit (0.5 )),
88- pa .array ([6 ], type = pa .float64 ())
89- ),
90- (f .array_agg (column ("b" )), pa .array ([[4 , 4 , 6 ]])),
91- ])
97+ @pytest .mark .parametrize (
98+ "agg_expr, expected" ,
99+ [
100+ (f .approx_distinct (column ("b" )), pa .array ([2 ], type = pa .uint64 ())),
101+ (f .approx_median (column ("b" )), pa .array ([4 ])),
102+ (f .approx_percentile_cont (column ("b" ), lit (0.5 )), pa .array ([4 ])),
103+ (
104+ f .approx_percentile_cont_with_weight (column ("b" ), lit (0.6 ), lit (0.5 )),
105+ pa .array ([6 ], type = pa .float64 ()),
106+ ),
107+ (f .array_agg (column ("b" )), pa .array ([[4 , 4 , 6 ]])),
108+ ],
109+ )
92110def test_aggregation (df , agg_expr , expected ):
93111 agg_df = df .aggregate ([], [agg_expr ])
94112 result = agg_df .collect ()[0 ]
@@ -98,20 +116,21 @@ def test_aggregation(df, agg_expr, expected):
def test_aggregate_100(df_aggregate_100):
    """Grouped approx_percentile_cont over the 100-row CSV table.

    Mirrors the upstream sqllogictest case:
    https://github.com/apache/datafusion/blob/bddb6415a50746d2803dd908d19c3758952d74f9/datafusion/sqllogictest/test_files/aggregate.slt#L1490-L1498
    """
    percentile_expr = f.approx_percentile_cont(
        column("c3"), lit(0.95), lit(200)
    ).alias("c3")
    batches = (
        df_aggregate_100.aggregate([column("c1")], [percentile_expr])
        .sort(column("c1").sort(ascending=True))
        .collect()
    )

    # All five groups fit in a single record batch.
    assert len(batches) == 1
    batch = batches[0]
    assert batch.column("c1") == pa.array(["a", "b", "c", "d", "e"])
    assert batch.column("c3") == pa.array([73, 68, 122, 124, 115])
133+
115134def test_bit_add_or_xor (df ):
116135 df = df .aggregate (
117136 [],
0 commit comments