# datafusion/sqllogictest/test_files/aggregate.slt

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at

#   http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#######
# Setup test data table
#######
# Registers the CSV fixture as `aggregate_test_100`; most queries in this file read from it.
# Only c4 and c5 are declared nullable.
statement ok
CREATE EXTERNAL TABLE aggregate_test_100 (
    c1 VARCHAR NOT NULL,
    c2 TINYINT NOT NULL,
    c3 SMALLINT NOT NULL,
    c4 SMALLINT,
    c5 INT,
    c6 BIGINT NOT NULL,
    c7 SMALLINT NOT NULL,
    c8 INT NOT NULL,
    c9 INT UNSIGNED NOT NULL,
    c10 BIGINT UNSIGNED NOT NULL,
    c11 FLOAT NOT NULL,
    c12 DOUBLE NOT NULL,
    c13 VARCHAR NOT NULL
)
STORED AS CSV WITH HEADER ROW
LOCATION '../../testing/data/csv/aggregate_test_100.csv'

# Decimal fixture: ten positive values tagged 'A' followed by ten negative values tagged 'B'.
statement ok
CREATE TABLE d_table (c1 DECIMAL(10,3), c2 VARCHAR) AS VALUES
    (110.000, 'A'),
    (110.001, 'A'),
    (110.002, 'A'),
    (110.003, 'A'),
    (110.004, 'A'),
    (110.005, 'A'),
    (110.006, 'A'),
    (110.007, 'A'),
    (110.008, 'A'),
    (110.009, 'A'),
    (-100.000, 'B'),
    (-100.001, 'B'),
    (-100.002, 'B'),
    (-100.003, 'B'),
    (-100.004, 'B'),
    (-100.005, 'B'),
    (-100.006, 'B'),
    (-100.007, 'B'),
    (-100.008, 'B'),
    (-100.009, 'B')

# Four-row fixture for the median tests, one column per numeric type.
# Each VALUES row is laid out as: signed ints / unsigned ints / floats.
# The first two rows hold the signed minimums, the last row the type maximums.
statement ok
CREATE TABLE median_table (
    col_i8      TINYINT,
    col_i16     SMALLINT,
    col_i32     INT,
    col_i64     BIGINT,
    col_u8      TINYINT UNSIGNED,
    col_u16     SMALLINT UNSIGNED,
    col_u32     INT UNSIGNED,
    col_u64     BIGINT UNSIGNED,
    col_f32     FLOAT,
    col_f64     DOUBLE,
    col_f64_nan DOUBLE
) AS VALUES
(
    -128, -32768, -2147483648, arrow_cast(-9223372036854775808, 'Int64'),
    0, 0, 0, arrow_cast(0, 'UInt64'),
    1.1, 1.1, 1.1
),
(
    -128, -32768, -2147483648, arrow_cast(-9223372036854775808, 'Int64'),
    0, 0, 0, arrow_cast(0, 'UInt64'),
    4.4, 4.4, arrow_cast('NAN', 'Float64')
),
(
    100, 100, 100, arrow_cast(100, 'Int64'),
    100, 100, 100, arrow_cast(100, 'UInt64'),
    3.3, 3.3, arrow_cast('NAN', 'Float64')
),
(
    127, 32767, 2147483647, arrow_cast(9223372036854775807, 'Int64'),
    255, 65535, 4294967295, 18446744073709551615,
    2.2, 2.2, arrow_cast('NAN', 'Float64')
)

# Small two-column fixture: each column contains one NULL and the (3, 2) row appears twice.
statement ok
CREATE TABLE test (c1 BIGINT, c2 BIGINT) AS VALUES
    (0, NULL),
    (1, 1),
    (NULL, 1),
    (3, 2),
    (3, 2)

#######
# Error tests
#######

# https://github.com/apache/arrow-datafusion/issues/3353
# Two differently-aliased approx_distinct calls over c9 are expected to fail with a
# duplicate-field schema error.
statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "APPROX_DISTINCT\(aggregate_test_100\.c9\)"
SELECT
  approx_distinct(c9) count_c9,
  approx_distinct(CAST(c9 AS VARCHAR)) count_c9_str
FROM aggregate_test_100

# csv_query_approx_percentile_cont_with_weight
# Each statement passes the Utf8 column c1 in a different argument position and must be
# rejected during planning. The expected message is a regex, so literal dots and
# parentheses are escaped.
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT_WITH_WEIGHT\(Utf8, Int8, Float64\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont_with_weight(c1, c2, 0.95) FROM aggregate_test_100

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT_WITH_WEIGHT\(Int16, Utf8, Float64\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont_with_weight(c3, c1, 0.95) FROM aggregate_test_100

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT_WITH_WEIGHT\(Int16, Int8, Utf8\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont_with_weight(c3, c2, c1) FROM aggregate_test_100

# csv_query_approx_percentile_cont_with_histogram_bins
# Invalid third argument (max_size) cases. A negative integer literal is reported as
# "not implemented"; a Utf8 column or Float64 literal fails signature matching during
# planning. The expected messages are regexes, so literal dots are escaped.
statement error This feature is not implemented: Tdigest max_size value for 'APPROX_PERCENTILE_CONT' must be UInt > 0 literal \(got data type Int64\)\.
SELECT c1, approx_percentile_cont(c3, 0.95, -1000) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT\(Int16, Float64, Utf8\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont(c3, 0.95, c1) FROM aggregate_test_100

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT\(Int16, Float64, Float64\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont(c3, 0.95, 111.1) FROM aggregate_test_100

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'APPROX_PERCENTILE_CONT\(Float64, Float64, Float64\)'\. You might need to add explicit type casts\.
SELECT approx_percentile_cont(c12, 0.95, 111.1) FROM aggregate_test_100

# array_agg accepts an ORDER BY clause inside the call.
# Input is limited to the five smallest c13 values.
query ?
SELECT array_agg(c13 ORDER BY c13)
FROM (
    SELECT *
    FROM aggregate_test_100
    ORDER BY c13
    LIMIT 5
) AS t1
----
[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB, 0og6hSkhbX8AC1ktFS4kounvTzy8Vo, 1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO, 2T3wSlHdEmASmO0xcXHnndkKEt6bz8]

# CSV fixture used by the multi-column ORDER BY array_agg tests.
statement ok
CREATE EXTERNAL TABLE agg_order (
    c1 INT NOT NULL,
    c2 INT NOT NULL,
    c3 INT NOT NULL
)
STORED AS CSV WITH HEADER ROW
LOCATION '../core/tests/data/aggregate_agg_multi_order.csv';

# array_agg ordered by several columns (c2 descending, then c3)
query ?
SELECT array_agg(c1 ORDER BY c2 DESC, c3)
FROM agg_order;
----
[5, 6, 7, 8, 9, 1, 2, 3, 4, 10]

# EXPLAIN for array_agg with a multi-column ORDER BY.
# Everything after `----` is the asserted plan text. The expected physical plan places a
# SortExec on (c2 DESC, c3 ASC NULLS LAST) beneath the partial aggregate.
query TT
explain select array_agg(c1 order by c2 desc, c3) from agg_order;
----
logical_plan
Aggregate: groupBy=[[]], aggr=[[ARRAY_AGG(agg_order.c1) ORDER BY [agg_order.c2 DESC NULLS FIRST, agg_order.c3 ASC NULLS LAST]]]
--TableScan: agg_order projection=[c1, c2, c3]
physical_plan
AggregateExec: mode=Final, gby=[], aggr=[ARRAY_AGG(agg_order.c1)]
--CoalescePartitionsExec
----AggregateExec: mode=Partial, gby=[], aggr=[ARRAY_AGG(agg_order.c1)]
------SortExec: expr=[c2@1 DESC,c3@2 ASC NULLS LAST]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_agg_multi_order.csv]]}, projection=[c1, c2, c3], has_header=true

# A LIMIT clause inside array_agg is expected to be rejected as not implemented.
statement error This feature is not implemented: LIMIT not supported in ARRAY_AGG: 1
SELECT array_agg(c13 LIMIT 1)
FROM aggregate_test_100


# FIX: custom absolute values
# csv_query_avg_multi_batch

# csv_query_avg
# Average of the DOUBLE column c12 over the whole fixture.
query R
SELECT avg(c12)
FROM aggregate_test_100
----
0.508972509913

# csv_query_bit_and
query IIIII
SELECT
  bit_and(c5),
  bit_and(c6),
  bit_and(c7),
  bit_and(c8),
  bit_and(c9)
FROM aggregate_test_100
----
0 0 0 0 0

# csv_query_bit_and_distinct
query IIIII
SELECT
  bit_and(DISTINCT c5),
  bit_and(DISTINCT c6),
  bit_and(DISTINCT c7),
  bit_and(DISTINCT c8),
  bit_and(DISTINCT c9)
FROM aggregate_test_100
----
0 0 0 0 0

# csv_query_bit_or
query IIIII
SELECT
  bit_or(c5),
  bit_or(c6),
  bit_or(c7),
  bit_or(c8),
  bit_or(c9)
FROM aggregate_test_100
----
-1 -1 255 65535 4294967295

# csv_query_bit_or_distinct
query IIIII
SELECT
  bit_or(DISTINCT c5),
  bit_or(DISTINCT c6),
  bit_or(DISTINCT c7),
  bit_or(DISTINCT c8),
  bit_or(DISTINCT c9)
FROM aggregate_test_100
----
-1 -1 255 65535 4294967295

# csv_query_bit_xor
query IIIII
SELECT
  bit_xor(c5),
  bit_xor(c6),
  bit_xor(c7),
  bit_xor(c8),
  bit_xor(c9)
FROM aggregate_test_100
----
1632751011 5960911605712039654 148 54789 169634700

# csv_query_bit_xor_distinct
# Expected to differ from the non-distinct result above (here only the c7 value does).
query IIIII
SELECT
  bit_xor(DISTINCT c5),
  bit_xor(DISTINCT c6),
  bit_xor(DISTINCT c7),
  bit_xor(DISTINCT c8),
  bit_xor(DISTINCT c9)
FROM aggregate_test_100
----
1632751011 5960911605712039654 196 54789 169634700

# csv_query_bit_xor_distinct_expr
# DISTINCT applied to an expression rather than a bare column.
query I
SELECT bit_xor(DISTINCT c5 % 2)
FROM aggregate_test_100
----
-2

# csv_query_covariance_1
query R
SELECT covar_pop(c2, c12)
FROM aggregate_test_100
----
-0.079169322354

# csv_query_covariance_2
query R
SELECT covar(c2, c12)
FROM aggregate_test_100
----
-0.079969012479

# single_row_query_covar_1
# Sample covariance of a single row is NULL.
query R
SELECT covar_samp(sq.column1, sq.column2)
FROM (VALUES (1.1, 2.2)) AS sq
----
NULL

# single_row_query_covar_2
# Population covariance of a single row is 0.
query R
SELECT covar_pop(sq.column1, sq.column2)
FROM (VALUES (1.1, 2.2)) AS sq
----
0

# all_nulls_query_covar
query RR
WITH data AS (
  SELECT CAST(NULL AS INT) AS f, CAST(NULL AS INT) AS b
  UNION ALL
  SELECT CAST(NULL AS INT) AS f, CAST(NULL AS INT) AS b
)
SELECT
  covar_samp(f, b),
  covar_pop(f, b)
FROM data
----
NULL NULL

# covar_query_with_nulls
# The expected values correspond to the three rows where both f and b are non-NULL.
query RR
WITH data AS (
  SELECT 1 AS f, 4 AS b
  UNION ALL
  SELECT NULL AS f, 99 AS b
  UNION ALL
  SELECT 2 AS f, 5 AS b
  UNION ALL
  SELECT 98 AS f, NULL AS b
  UNION ALL
  SELECT 3 AS f, 6 AS b
  UNION ALL
  SELECT NULL AS f, NULL AS b
)
SELECT
  covar_samp(f, b),
  covar_pop(f, b)
FROM data
----
1 0.666666666667

# csv_query_correlation
query R
SELECT corr(c2, c12)
FROM aggregate_test_100
----
-0.190645441906

# single_row_query_correlation
# A single input row is expected to yield 0.
query R
SELECT corr(sq.column1, sq.column2)
FROM (VALUES (1.1, 2.2)) AS sq
----
0

# all_nulls_query_correlation
query R
WITH data AS (
  SELECT CAST(NULL AS INT) AS f, CAST(NULL AS INT) AS b
  UNION ALL
  SELECT CAST(NULL AS INT) AS f, CAST(NULL AS INT) AS b
)
SELECT corr(f, b)
FROM data
----
NULL

# correlation_query_with_nulls
# The three fully non-NULL rows (1,4), (2,5), (3,6) are perfectly correlated.
query R
WITH data AS (
  SELECT 1 AS f, 4 AS b
  UNION ALL
  SELECT NULL AS f, 99 AS b
  UNION ALL
  SELECT 2 AS f, 5 AS b
  UNION ALL
  SELECT 98 AS f, NULL AS b
  UNION ALL
  SELECT 3 AS f, 6 AS b
  UNION ALL
  SELECT NULL AS f, NULL AS b
)
SELECT corr(f, b)
FROM data
----
1

# csv_query_variance_1
query R
SELECT var_pop(c2)
FROM aggregate_test_100
----
1.8675

# csv_query_variance_2
query R
SELECT var_pop(c6)
FROM aggregate_test_100
----
26156334342021890000000000000000000000

# csv_query_variance_3
query R
SELECT var_pop(c12)
FROM aggregate_test_100
----
0.092342237216

# csv_query_variance_4
# var and var_samp (next query) are expected to return the same value.
query R
SELECT var(c2)
FROM aggregate_test_100
----
1.886363636364

# csv_query_variance_5
query R
SELECT var_samp(c2)
FROM aggregate_test_100
----
1.886363636364

# csv_query_stddev_1
query R
SELECT stddev_pop(c2)
FROM aggregate_test_100
----
1.366565036872

# csv_query_stddev_2
query R
SELECT stddev_pop(c6)
FROM aggregate_test_100
----
5114326382039172000

# csv_query_stddev_3
query R
SELECT stddev_pop(c12)
FROM aggregate_test_100
----
0.303878655413

# csv_query_stddev_4
# stddev and stddev_samp (next query) are expected to return the same value.
query R
SELECT stddev(c12)
FROM aggregate_test_100
----
0.305409539941

# csv_query_stddev_5
query R
SELECT stddev_samp(c12)
FROM aggregate_test_100
----
0.305409539941

# csv_query_stddev_6
query R
SELECT stddev(sq.column1)
FROM (VALUES (1.1), (2.0), (3.0)) AS sq
----
0.950438495292

# csv_query_approx_median_1
query I
SELECT approx_median(c2)
FROM aggregate_test_100
----
3

# csv_query_approx_median_2
query I
SELECT approx_median(c6)
FROM aggregate_test_100
----
1146409980542786560

# csv_query_approx_median_3
query R
SELECT approx_median(c12)
FROM aggregate_test_100
----
0.555006541052

# csv_query_median_1
query I
SELECT median(c2)
FROM aggregate_test_100
----
3

# csv_query_median_2
query I
SELECT median(c6)
FROM aggregate_test_100
----
1125553990140691277

# csv_query_median_3
query R
SELECT median(c12)
FROM aggregate_test_100
----
0.551390054439

# Medians over the four-row median_table: with an even row count the expected value is
# the mean of the two middle values (e.g. (-128 + 100) / 2 = -14 for col_i8).

# median_i8
query I
SELECT median(col_i8)
FROM median_table
----
-14

# median_i16
query I
SELECT median(col_i16)
FROM median_table
----
-16334

# median_i32
query I
SELECT median(col_i32)
FROM median_table
----
-1073741774

# median_i64
query I
SELECT median(col_i64)
FROM median_table
----
-4611686018427387854

# median_u8
query I
SELECT median(col_u8)
FROM median_table
----
50

# median_u16
query I
SELECT median(col_u16)
FROM median_table
----
50

# median_u32
query I
SELECT median(col_u32)
FROM median_table
----
50

# median_u64
query I
SELECT median(col_u64)
FROM median_table
----
50

# median_f32
query R
SELECT median(col_f32)
FROM median_table
----
2.75

# median_f64
query R
SELECT median(col_f64)
FROM median_table
----
2.75

# median_f64_nan
# Three of the four inputs are NaN, and the expected median is NaN.
query R
SELECT median(col_f64_nan)
FROM median_table
----
NaN

# approx_median_f64_nan
query R
SELECT approx_median(col_f64_nan)
FROM median_table
----
NaN

# median_multi
# test case for https://github.com/apache/arrow-datafusion/issues/3105
# has an intermediate grouping
statement ok
CREATE TABLE cpu (host STRING, usage FLOAT) AS
SELECT *
FROM (
  VALUES
    ('host0', 90.1),
    ('host1', 90.2),
    ('host1', 90.4)
);

query TR rowsort
SELECT host, median(usage)
FROM cpu
GROUP BY host;
----
host0 90.1
host1 90.3

statement ok
DROP TABLE cpu;

# This test shows that CREATE TABLE AS and SELECT INTO work the same way:
# the grouped result matches the CREATE TABLE AS variant of the same data.
statement ok
SELECT *
INTO cpu
FROM (
  VALUES
    ('host0', 90.1),
    ('host1', 90.2),
    ('host1', 90.4)
) AS cpu (host, usage);

query TR rowsort
SELECT host, median(usage)
FROM cpu
GROUP BY host;
----
host0 90.1
host1 90.3

# Median over all three rows, without grouping.
query R
SELECT median(usage)
FROM cpu;
----
90.2

statement ok
DROP TABLE cpu;

# median_multi_odd

# Input is unsorted, with an odd number of values per group (3 for host0, 5 for host1).
statement ok
CREATE TABLE cpu (host STRING, usage FLOAT) AS
SELECT *
FROM (
  VALUES
    ('host0', 90.2),
    ('host1', 90.1),
    ('host1', 90.5),
    ('host0', 90.5),
    ('host1', 90.0),
    ('host1', 90.3),
    ('host0', 87.9),
    ('host1', 89.3)
);

query TR rowsort
SELECT host, median(usage)
FROM cpu
GROUP BY host;
----
host0 90.2
host1 90.1


statement ok
DROP TABLE cpu;

# median_multi_even
# Input is unsorted, with an even number of values per group (2 for host0, 6 for host1),
# so each expected median is the mean of the two middle values.
statement ok
CREATE TABLE cpu (host STRING, usage FLOAT) AS
SELECT *
FROM (
  VALUES
    ('host0', 90.2),
    ('host1', 90.1),
    ('host1', 90.5),
    ('host0', 90.5),
    ('host1', 90.0),
    ('host1', 90.3),
    ('host1', 90.2),
    ('host1', 90.3)
);

query TR rowsort
SELECT host, median(usage)
FROM cpu
GROUP BY host;
----
host0 90.35
host1 90.25

statement ok
DROP TABLE cpu

# csv_query_external_table_count
query I
SELECT COUNT(c12)
FROM aggregate_test_100
----
100

# csv_query_external_table_sum
# Both columns are cast to BIGINT before summing.
query II
SELECT
  SUM(CAST(c7 AS BIGINT)),
  SUM(CAST(c8 AS BIGINT))
FROM aggregate_test_100
----
13060 3017641

# csv_query_count
query I
SELECT count(c12)
FROM aggregate_test_100
----
100

# csv_query_count_distinct
query I
SELECT count(DISTINCT c2)
FROM aggregate_test_100
----
5

# csv_query_count_distinct_expr
query I
SELECT count(DISTINCT c2 % 2)
FROM aggregate_test_100
----
2

# csv_query_count_star
query I
SELECT COUNT(*)
FROM aggregate_test_100
----
100

# csv_query_count_literal
# Counting a non-NULL literal yields the row count.
query I
SELECT COUNT(2)
FROM aggregate_test_100
----
100

# csv_query_approx_count
# FIX: https://github.com/apache/arrow-datafusion/issues/3353
# query II
# SELECT approx_distinct(c9) AS count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100
# ----
# 100 99

# csv_query_approx_count_dupe_expr_aliased
# The same expression under two different aliases is accepted and returns both columns.
query II
SELECT
  approx_distinct(c9) AS a,
  approx_distinct(c9) AS b
FROM aggregate_test_100
----
100 100

## These tests execute the APPROX_PERCENTILE_CONT aggregation against the test
## data, asserting that the estimated quantiles are within ±5% of their actual values.
##
## Actual quantiles calculated with:
##
## ```r
## read_csv("./testing/data/csv/aggregate_test_100.csv") |>
##     select_if(is.numeric) |>
##     summarise_all(~ quantile(., c(0.1, 0.5, 0.9)))
## ```
##
## Giving:
##
## ```text
##      c2    c3      c4           c5       c6    c7     c8          c9     c10   c11    c12
##   <dbl> <dbl>   <dbl>        <dbl>    <dbl> <dbl>  <dbl>       <dbl>   <dbl> <dbl>  <dbl>
## 1     1 -95.3 -22925. -1882606710  -7.25e18  18.9  2671.  472608672. 1.83e18 0.109 0.0714
## 2     3  15.5   4599    377164262   1.13e18 134.  30634  2365817608. 9.30e18 0.491 0.551
## 3     5 102.   25334.  1991374996.  7.37e18 231   57518. 3776538487. 1.61e19 0.834 0.946
## ```
##
## Column `c12` is omitted because its small float values lead to a large
## relative error (~10%).

# csv_query_approx_percentile_cont (c2)
# Reference quantiles (0.1 / 0.5 / 0.9): 1, 3, 5. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.1) AS DOUBLE) / 1.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.5) AS DOUBLE) / 3.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c2, 0.9) AS DOUBLE) / 5.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c3)
# Reference quantiles (0.1 / 0.5 / 0.9): -95.3, 15.5, 102. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.1) AS DOUBLE) / -95.3) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.5) AS DOUBLE) / 15.5) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c3, 0.9) AS DOUBLE) / 102.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c4)
# Reference quantiles (0.1 / 0.5 / 0.9): -22925, 4599, 25334. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.1) AS DOUBLE) / -22925.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.5) AS DOUBLE) / 4599.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c4, 0.9) AS DOUBLE) / 25334.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c5)
# Reference quantiles (0.1 / 0.5 / 0.9): -1882606710, 377164262, 1991374996.
# Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.1) AS DOUBLE) / -1882606710.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.5) AS DOUBLE) / 377164262.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c5, 0.9) AS DOUBLE) / 1991374996.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c6)
# Reference quantiles (0.1 / 0.5 / 0.9): -7.25e18, 1.13e18, 7.37e18, written as integer literals.
# Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.1) AS DOUBLE) / -7250000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.5) AS DOUBLE) / 1130000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c6, 0.9) AS DOUBLE) / 7370000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c7)
# Reference quantiles (0.1 / 0.5 / 0.9): 18.9, 134, 231. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.1) AS DOUBLE) / 18.9) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.5) AS DOUBLE) / 134.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c7, 0.9) AS DOUBLE) / 231.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c8)
# Reference quantiles (0.1 / 0.5 / 0.9): 2671, 30634, 57518. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.1) AS DOUBLE) / 2671.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.5) AS DOUBLE) / 30634.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c8, 0.9) AS DOUBLE) / 57518.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c9)
# Reference quantiles (0.1 / 0.5 / 0.9): 472608672, 2365817608, 3776538487.
# Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.1) AS DOUBLE) / 472608672.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.5) AS DOUBLE) / 2365817608.0) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c9, 0.9) AS DOUBLE) / 3776538487.0) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c10)
# Reference quantiles (0.1 / 0.5 / 0.9): 1.83e18, 9.30e18, 1.61e19, written as integer literals.
# Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.1) AS DOUBLE) / 1830000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.5) AS DOUBLE) / 9300000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c10, 0.9) AS DOUBLE) / 16100000000000000000) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_approx_percentile_cont (c11)
# Reference quantiles (0.1 / 0.5 / 0.9): 0.109, 0.491, 0.834. Each check allows 5% relative error.
query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.1) AS DOUBLE) / 0.109) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.5) AS DOUBLE) / 0.491) < 0.05) AS q
FROM aggregate_test_100
----
true

query B
SELECT (ABS(1 - CAST(approx_percentile_cont(c11, 0.9) AS DOUBLE) / 0.834) < 0.05) AS q
FROM aggregate_test_100
----
true

# csv_query_cube_avg
# CUBE emits per-(c1, c2) rows, per-c1 and per-c2 subtotals (NULL in the rolled-up
# column) and a grand-total row.
query TIR
SELECT
  c1,
  c2,
  AVG(c3)
FROM aggregate_test_100
GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
----
a 1 -17.6
a 2 -15.333333333333
a 3 -4.5
a 4 -32
a 5 -32
a NULL -18.333333333333
b 1 31.666666666667
b 2 25.5
b 3 -42
b 4 -44.6
b 5 -0.2
b NULL -5.842105263158
c 1 47.5
c 2 -55.571428571429
c 3 47.5
c 4 -10.75
c 5 12
c NULL -1.333333333333
d 1 -8.142857142857
d 2 109.333333333333
d 3 41.333333333333
d 4 54
d 5 -49.5
d NULL 25.444444444444
e 1 75.666666666667
e 2 37.8
e 3 48
e 4 37.285714285714
e 5 -11
e NULL 40.333333333333
NULL 1 16.681818181818
NULL 2 8.363636363636
NULL 3 20.789473684211
NULL 4 1.260869565217
NULL 5 -13.857142857143
NULL NULL 7.81

# csv_query_rollup_avg
# ROLLUP over (c1, c2, c3); the expected rows cover only c1 = 'a' and 'b', plus the
# grand-total row at the end.
query TIIR
SELECT
  c1,
  c2,
  c3,
  AVG(c4)
FROM aggregate_test_100
WHERE c1 IN ('a', 'b', NULL)
GROUP BY ROLLUP (c1, c2, c3)
ORDER BY c1, c2, c3
----
a 1 -85 -15154
a 1 -56 8692
a 1 -25 15295
a 1 -5 12636
a 1 83 -14704
a 1 NULL 1353
a 2 -48 -18025
a 2 -43 13080
a 2 45 15673
a 2 NULL 3576
a 3 -72 -11122
a 3 -12 -9168
a 3 13 22338.5
a 3 14 28162
a 3 17 -22796
a 3 NULL 4958.833333333333
a 4 -101 11640
a 4 -54 -2376
a 4 -38 20744
a 4 65 -28462
a 4 NULL 386.5
a 5 -101 -12484
a 5 -31 -12907
a 5 36 -16974
a 5 NULL -14121.666666666666
a NULL NULL 306.047619047619
b 1 12 7652
b 1 29 -18218
b 1 54 -18410
b 1 NULL -9658.666666666666
b 2 -60 -21739
b 2 31 23127
b 2 63 21456
b 2 68 15874
b 2 NULL 9679.5
b 3 -101 -13217
b 3 17 14457
b 3 NULL 620
b 4 -117 19316
b 4 -111 -1967
b 4 -59 25286
b 4 17 -28070
b 4 47 20690
b 4 NULL 7051
b 5 -82 22080
b 5 -44 15788
b 5 -5 24896
b 5 62 16337
b 5 68 21576
b 5 NULL 20135.4
b NULL NULL 7732.315789473684
NULL NULL NULL 3833.525

# csv_query_groupingsets_avg
# Explicit GROUPING SETS equivalent to the ROLLUP levels without the grand total,
# so no all-NULL row is expected.
query TIIR
SELECT
  c1,
  c2,
  c3,
  AVG(c4)
FROM aggregate_test_100
WHERE c1 IN ('a', 'b', NULL)
GROUP BY GROUPING SETS ((c1), (c1, c2), (c1, c2, c3))
ORDER BY c1, c2, c3
----
a 1 -85 -15154
a 1 -56 8692
a 1 -25 15295
a 1 -5 12636
a 1 83 -14704
a 1 NULL 1353
a 2 -48 -18025
a 2 -43 13080
a 2 45 15673
a 2 NULL 3576
a 3 -72 -11122
a 3 -12 -9168
a 3 13 22338.5
a 3 14 28162
a 3 17 -22796
a 3 NULL 4958.833333333333
a 4 -101 11640
a 4 -54 -2376
a 4 -38 20744
a 4 65 -28462
a 4 NULL 386.5
a 5 -101 -12484
a 5 -31 -12907
a 5 36 -16974
a 5 NULL -14121.666666666666
a NULL NULL 306.047619047619
b 1 12 7652
b 1 29 -18218
b 1 54 -18410
b 1 NULL -9658.666666666666
b 2 -60 -21739
b 2 31 23127
b 2 63 21456
b 2 68 15874
b 2 NULL 9679.5
b 3 -101 -13217
b 3 17 14457
b 3 NULL 620
b 4 -117 19316
b 4 -111 -1967
b 4 -59 25286
b 4 17 -28070
b 4 47 20690
b 4 NULL 7051
b 5 -82 22080
b 5 -44 15788
b 5 -5 24896
b 5 62 16337
b 5 68 21576
b 5 NULL 20135.4
b NULL NULL 7732.315789473684

# csv_query_singlecol_with_rollup_avg
# A plain grouping column (c1) combined with ROLLUP (c2, c3); the expected rows match
# the GROUPING SETS variant of the same query.
query TIIR
SELECT
  c1,
  c2,
  c3,
  AVG(c4)
FROM aggregate_test_100
WHERE c1 IN ('a', 'b', NULL)
GROUP BY c1, ROLLUP (c2, c3)
ORDER BY c1, c2, c3
----
a 1 -85 -15154
a 1 -56 8692
a 1 -25 15295
a 1 -5 12636
a 1 83 -14704
a 1 NULL 1353
a 2 -48 -18025
a 2 -43 13080
a 2 45 15673
a 2 NULL 3576
a 3 -72 -11122
a 3 -12 -9168
a 3 13 22338.5
a 3 14 28162
a 3 17 -22796
a 3 NULL 4958.833333333333
a 4 -101 11640
a 4 -54 -2376
a 4 -38 20744
a 4 65 -28462
a 4 NULL 386.5
a 5 -101 -12484
a 5 -31 -12907
a 5 36 -16974
a 5 NULL -14121.666666666666
a NULL NULL 306.047619047619
b 1 12 7652
b 1 29 -18218
b 1 54 -18410
b 1 NULL -9658.666666666666
b 2 -60 -21739
b 2 31 23127
b 2 63 21456
b 2 68 15874
b 2 NULL 9679.5
b 3 -101 -13217
b 3 17 14457
b 3 NULL 620
b 4 -117 19316
b 4 -111 -1967
b 4 -59 25286
b 4 17 -28070
b 4 47 20690
b 4 NULL 7051
b 5 -82 22080
b 5 -44 15788
b 5 -5 24896
b 5 62 16337
b 5 68 21576
b 5 NULL 20135.4
b NULL NULL 7732.315789473684

# csv_query_approx_percentile_cont_with_weight
# Unweighted baseline; the constant-weight query that follows expects the same values.
query TI
SELECT
  c1,
  approx_percentile_cont(c3, 0.95) AS c3_p95
FROM aggregate_test_100
GROUP BY 1
ORDER BY 1
----
a 73
b 68
c 122
d 124
e 115

# csv_query_approx_percentile_cont_with_weight (2)
# Constant weight of 1 for every row.
query TI
SELECT
  c1,
  approx_percentile_cont_with_weight(c3, 1, 0.95) AS c3_p95
FROM aggregate_test_100
GROUP BY 1
ORDER BY 1
----
a 73
b 68
c 122
d 124
e 115

# csv_query_approx_percentile_cont_with_histogram_bins
# Explicit third argument of 200; expected values equal the unweighted baseline.
query TI
SELECT
  c1,
  approx_percentile_cont(c3, 0.95, 200) AS c3_p95
FROM aggregate_test_100
GROUP BY 1
ORDER BY 1
----
a 73
b 68
c 122
d 124
e 115

# Weights taken from column c2; groups a and c shift by one relative to the baseline.
query TI
SELECT
  c1,
  approx_percentile_cont_with_weight(c3, c2, 0.95) AS c3_p95
FROM aggregate_test_100
GROUP BY 1
ORDER BY 1
----
a 74
b 68
c 123
d 124
e 115

# csv_query_sum_crossjoin
# Self cross join of the fixture, summing a.c2 per (a.c1, b.c1) pair.
query TTI
SELECT
  a.c1,
  b.c1,
  SUM(a.c2)
FROM aggregate_test_100 AS a
CROSS JOIN aggregate_test_100 AS b
GROUP BY a.c1, b.c1
ORDER BY a.c1, b.c1
----
a a 1260
a b 1140
a c 1260
a d 1080
a e 1260
b a 1302
b b 1178
b c 1302
b d 1116
b e 1302
c a 1176
c b 1064
c c 1176
c d 1008
c e 1176
d a 924
d b 836
d c 924
d d 792
d e 924
e a 1323
e b 1197
e c 1323
e d 1134
e e 1323

# csv_query_cube_sum_crossjoin
# Same cross join as csv_query_sum_crossjoin, grouped with CUBE so that subtotals for
# each side and a grand total are included.
query TTI
SELECT
  a.c1,
  b.c1,
  SUM(a.c2)
FROM aggregate_test_100 AS a
CROSS JOIN aggregate_test_100 AS b
GROUP BY CUBE (a.c1, b.c1)
ORDER BY a.c1, b.c1
----
a    a    1260
a    b    1140
a    c    1260
a    d    1080
a    e    1260
a    NULL 6000
b    a    1302
b    b    1178
b    c    1302
b    d    1116
b    e    1302
b    NULL 6200
c    a    1176
c    b    1064
c    c    1176
c    d    1008
c    e    1176
c    NULL 5600
d    a    924
d    b    836
d    c    924
d    d    792
d    e    924
d    NULL 4400
e    a    1323
e    b    1197
e    c    1323
e    d    1134
e    e    1323
e    NULL 6300
NULL a    5985
NULL b    5415
NULL c    5985
NULL d    5130
NULL e    5985
NULL NULL 28500

# csv_query_cube_distinct_count
# COUNT(DISTINCT ...) evaluated for every CUBE grouping level of (c1, c2).
query TII
SELECT
  c1,
  c2,
  COUNT(DISTINCT c3)
FROM aggregate_test_100
GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
----
a    1    5
a    2    3
a    3    5
a    4    4
a    5    3
a    NULL 19
b    1    3
b    2    4
b    3    2
b    4    5
b    5    5
b    NULL 17
c    1    4
c    2    7
c    3    4
c    4    4
c    5    2
c    NULL 21
d    1    7
d    2    3
d    3    3
d    4    3
d    5    2
d    NULL 18
e    1    3
e    2    4
e    3    4
e    4    7
e    5    2
e    NULL 18
NULL 1    22
NULL 2    20
NULL 3    17
NULL 4    23
NULL 5    14
NULL NULL 80

# csv_query_rollup_distinct_count
# ROLLUP variant of the distinct count: per-c1 subtotals and a grand total, but no
# per-c2 subtotals.
query TII
SELECT
  c1,
  c2,
  COUNT(DISTINCT c3)
FROM aggregate_test_100
GROUP BY ROLLUP (c1, c2)
ORDER BY c1, c2
----
a    1    5
a    2    3
a    3    5
a    4    4
a    5    3
a    NULL 19
b    1    3
b    2    4
b    3    2
b    4    5
b    5    5
b    NULL 17
c    1    4
c    2    7
c    3    4
c    4    4
c    5    2
c    NULL 21
d    1    7
d    2    3
d    3    3
d    4    3
d    5    2
d    NULL 18
e    1    3
e    2    4
e    3    4
e    4    7
e    5    2
e    NULL 18
NULL NULL 80

# csv_query_rollup_sum_crossjoin
# ROLLUP variant of the cross-join sum: per-a.c1 subtotals and a grand total.
query TTI
SELECT
  a.c1,
  b.c1,
  SUM(a.c2)
FROM aggregate_test_100 AS a
CROSS JOIN aggregate_test_100 AS b
GROUP BY ROLLUP (a.c1, b.c1)
ORDER BY a.c1, b.c1
----
a    a    1260
a    b    1140
a    c    1260
a    d    1080
a    e    1260
a    NULL 6000
b    a    1302
b    b    1178
b    c    1302
b    d    1116
b    e    1302
b    NULL 6200
c    a    1176
c    b    1064
c    c    1176
c    d    1008
c    e    1176
c    NULL 5600
d    a    924
d    b    836
d    c    924
d    d    792
d    e    924
d    NULL 4400
e    a    1323
e    b    1197
e    c    1323
e    d    1134
e    e    1323
e    NULL 6300
NULL NULL 28500

# query_count_without_from
# An aggregate with no FROM clause sees a single row.
query I
SELECT count(1 + 1)
----
1

# csv_query_array_agg
query ?
SELECT array_agg(c13)
FROM (
    SELECT *
    FROM aggregate_test_100
    ORDER BY c13
    LIMIT 2
) test
----
[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm, 0keZ5G8BffGwgF2RwQD59TFzMStxCB]

# csv_query_array_agg_empty
# Zero input rows produce an empty list.
query ?
SELECT array_agg(c13)
FROM (
    SELECT *
    FROM aggregate_test_100
    LIMIT 0
) test
----
[]

# csv_query_array_agg_one
query ?
SELECT array_agg(c13)
FROM (
    SELECT *
    FROM aggregate_test_100
    ORDER BY c13
    LIMIT 1
) test
----
[0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm]

# csv_query_array_agg_with_overflow
# NOTE(review): despite the name, this query calls no array_agg; it checks
# sum/avg/max/min/count of c3 per c2 group. Presumably the name was carried over
# from an earlier test -- confirm before renaming.
query IIRIII
SELECT
  c2,
  sum(c3) sum_c3,
  avg(c3) avg_c3,
  max(c3) max_c3,
  min(c3) min_c3,
  count(c3) count_c3
FROM aggregate_test_100
GROUP BY c2
ORDER BY c2
----
1 367 16.681818181818 125 -99 22
2 184 8.363636363636 122 -117 22
3 395 20.789473684211 123 -101 19
4 29 1.260869565217 123 -117 23
5 -194 -13.857142857143 118 -101 14

# csv_query_array_cube_agg_with_overflow
# sum/avg/max/min/count of c3 for every CUBE grouping level of (c1, c2).
query TIIRIII
SELECT
  c1,
  c2,
  sum(c3) sum_c3,
  avg(c3) avg_c3,
  max(c3) max_c3,
  min(c3) min_c3,
  count(c3) count_c3
FROM aggregate_test_100
GROUP BY CUBE (c1, c2)
ORDER BY c1, c2
----
a 1 -88 -17.6 83 -85 5
a 2 -46 -15.333333333333 45 -48 3
a 3 -27 -4.5 17 -72 6
a 4 -128 -32 65 -101 4
a 5 -96 -32 36 -101 3
a NULL -385 -18.333333333333 83 -101 21
b 1 95 31.666666666667 54 12 3
b 2 102 25.5 68 -60 4
b 3 -84 -42 17 -101 2
b 4 -223 -44.6 47 -117 5
b 5 -1 -0.2 68 -82 5
b NULL -111 -5.842105263158 68 -117 19
c 1 190 47.5 103 -24 4
c 2 -389 -55.571428571429 29 -117 7
c 3 190 47.5 97 -2 4
c 4 -43 -10.75 123 -90 4
c 5 24 12 118 -94 2
c NULL -28 -1.333333333333 123 -117 21
d 1 -57 -8.142857142857 125 -99 7
d 2 328 109.333333333333 122 93 3
d 3 124 41.333333333333 123 -76 3
d 4 162 54 102 5 3
d 5 -99 -49.5 -40 -59 2
d NULL 458 25.444444444444 125 -99 18
e 1 227 75.666666666667 120 36 3
e 2 189 37.8 97 -61 5
e 3 192 48 112 -95 4
e 4 261 37.285714285714 97 -56 7
e 5 -22 -11 64 -86 2
e NULL 847 40.333333333333 120 -95 21
NULL 1 367 16.681818181818 125 -99 22
NULL 2 184 8.363636363636 122 -117 22
NULL 3 395 20.789473684211 123 -101 19
NULL 4 29 1.260869565217 123 -117 23
NULL 5 -194 -13.857142857143 118 -101 14
NULL NULL 781 7.81 125 -117 100

# TODO: array_agg_distinct output is non-deterministic -- rewrite with array_sort(list_sort)
#       unnest is also not available, so manually unnesting via CROSS JOIN
# additional count(1) forces array_agg_distinct instead of array_agg over aggregated by c2 data
#
# csv_query_array_agg_distinct
# Each output row is one array element, the array length, and the row count.
query III
WITH indices AS (
  SELECT 1 AS idx
  UNION ALL
  SELECT 2 AS idx
  UNION ALL
  SELECT 3 AS idx
  UNION ALL
  SELECT 4 AS idx
  UNION ALL
  SELECT 5 AS idx
)
SELECT
  data.arr[indices.idx] AS element,
  array_length(data.arr) AS array_len,
  dummy
FROM (
  SELECT
    array_agg(DISTINCT c2) AS arr,
    count(1) AS dummy
  FROM aggregate_test_100
) data
CROSS JOIN indices
ORDER BY 1
----
1 5 100
2 5 100
3 5 100
4 5 100
5 5 100

# aggregate_time_min_and_max
# min/max over time-like string literals (result columns are typed as text).
query TT
SELECT min(t), max(t)
FROM (
  SELECT '00:00:00' AS t
  UNION
  SELECT '00:00:01'
  UNION
  SELECT '00:00:02'
)
----
00:00:00 00:00:02

# aggregate_decimal_min
# min keeps the input type Decimal128(10, 3).
query RT
SELECT min(c1), arrow_typeof(min(c1))
FROM d_table
----
-100.009 Decimal128(10, 3)

# aggregate_decimal_max
# max keeps the input type Decimal128(10, 3).
query RT
SELECT max(c1), arrow_typeof(max(c1))
FROM d_table
----
110.009 Decimal128(10, 3)

# aggregate_decimal_sum
# sum widens the precision to Decimal128(20, 3).
query RT
SELECT sum(c1), arrow_typeof(sum(c1))
FROM d_table
----
100 Decimal128(20, 3)

# aggregate_decimal_avg
# avg widens precision and scale to Decimal128(14, 7).
query RT
SELECT avg(c1), arrow_typeof(avg(c1))
FROM d_table
----
5 Decimal128(14, 7)


# aggregate
# NULL inputs are skipped by SUM.
query II
SELECT SUM(c1), SUM(c2)
FROM test
----
7 6

# aggregate_empty

# SUM over zero matching rows is NULL.
query II
SELECT SUM(c1), SUM(c2)
FROM test
WHERE c1 > 100000
----
NULL NULL

# aggregate_avg
query RR
SELECT AVG(c1), AVG(c2)
FROM test
----
1.75 1.5

# aggregate_max
query II
SELECT MAX(c1), MAX(c2)
FROM test
----
3 2

# aggregate_min
query II
SELECT MIN(c1), MIN(c2)
FROM test
----
0 1

# aggregate_grouped
# The NULL c1 key forms its own group; the c1 = 0 group has only a NULL c2, so its SUM is NULL.
query II
SELECT c1, SUM(c2)
FROM test
GROUP BY c1
ORDER BY c1
----
0 NULL
1 1
3 4
NULL 1

# aggregate_grouped_avg
query IR
SELECT c1, AVG(c2)
FROM test
GROUP BY c1
ORDER BY c1
----
0 NULL
1 1
3 2
NULL 1

# aggregate_grouped_empty
# No rows match the filter, so no groups are produced.
query IR
SELECT c1, AVG(c2)
FROM test
WHERE c1 = 123
GROUP BY c1
----

# aggregate_grouped_max
query II
SELECT c1, MAX(c2)
FROM test
GROUP BY c1
ORDER BY c1
----
0 NULL
1 1
3 2
NULL 1

# aggregate_grouped_min
query II
SELECT c1, MIN(c2)
FROM test
GROUP BY c1
ORDER BY c1
----
0 NULL
1 1
3 2
NULL 1

# aggregate_min_max_w_custom_window_frames: both frame bounds are numeric RANGE offsets
query RR
SELECT
  min(c12) OVER (
    ORDER BY c12
    RANGE BETWEEN 0.3 PRECEDING AND 0.2 FOLLOWING
  ) AS min1,
  max(c12) OVER (
    ORDER BY c11
    RANGE BETWEEN 0.1 PRECEDING AND 0.2 FOLLOWING
  ) AS max1
FROM aggregate_test_100
ORDER BY c9
LIMIT 5
----
0.014793053078 0.996540038759
0.014793053078 0.980019341044
0.014793053078 0.970671228336
0.266717779508 0.996540038759
0.360076636233 0.970671228336

# aggregate_min_max_with_custom_window_frames_unbounded_start: frame starts at UNBOUNDED PRECEDING
query RR
SELECT
  min(c12) OVER (
    ORDER BY c12
    RANGE BETWEEN UNBOUNDED PRECEDING AND 0.2 FOLLOWING
  ) AS min1,
  max(c12) OVER (
    ORDER BY c11
    RANGE BETWEEN UNBOUNDED PRECEDING AND 0.2 FOLLOWING
  ) AS max1
FROM aggregate_test_100
ORDER BY c9
LIMIT 5
----
0.014793053078 0.996540038759
0.014793053078 0.980019341044
0.014793053078 0.980019341044
0.014793053078 0.996540038759
0.014793053078 0.980019341044

# aggregate_avg_add: the same aggregate reused inside arithmetic expressions
query RRRR
SELECT
  avg(c1),
  avg(c1) + 1,
  avg(c1) + 2,
  1 + avg(c1)
FROM test
----
1.75 2.75 3.75 2.75

# case_sensitive_identifiers_aggregates
# NOTE(review): the function name is kept in lowercase exactly as written; presumably the casing is the point of this test.
query I
SELECT max(c1)
FROM test;
----
3


# count_basic
query II
SELECT
  count(c1),
  count(c2)
FROM test
----
4 4

# TODO: count_partitioned

# TODO: count_aggregated

# TODO: count_aggregated_cube

# count_multi_expr: COUNT with more than one argument
query I
SELECT count(c1, c2)
FROM test
----
3

# count_null: NULL literals are never counted
query III
SELECT
  count(NULL),
  count(NULL, NULL),
  count(DISTINCT NULL)
FROM test
----
0 0 0

# count_multi_expr_group_by
query I
SELECT count(c1, c2)
FROM test
GROUP BY c1
ORDER BY c1
----
0
1
2
0

# count_null_group_by: one all-zero row per c1 group
query III
SELECT
  count(NULL),
  count(NULL, NULL),
  count(DISTINCT NULL)
FROM test
GROUP BY c1
ORDER BY c1
----
0 0 0
0 0 0
0 0 0
0 0 0

# aggregate_with_alias: the aggregate carries a backtick-quoted alias containing a space
query II
SELECT
  c1,
  sum(c2) AS `Total Salary`
FROM test
GROUP BY c1
ORDER BY c1
----
0 NULL
1 1
3 4
NULL 1

# simple_avg
query R
SELECT avg(c1)
FROM test
----
1.75

# simple_mean: mean() returns the same value as avg() above
query R
SELECT mean(c1)
FROM test
----
1.75


# query_sum_distinct - 2 different aggregate functions: avg and sum(distinct)
query RI
SELECT
  avg(c1),
  sum(DISTINCT c2)
FROM test
----
1.75 3

# query_sum_distinct - 2 sum(distinct) functions
query II
SELECT
  sum(DISTINCT c1),
  sum(DISTINCT c2)
FROM test
----
4 3

# query_count_distinct
query I
SELECT count(DISTINCT c1)
FROM test
----
3

# TODO: count_distinct_integers_aggregated_single_partition

# TODO: count_distinct_integers_aggregated_multiple_partitions

# TODO: aggregate_with_alias

# array_agg_zero: aggregating a single empty list
query ?
SELECT array_agg([])
----
[[]]

# array_agg_one: aggregating a single one-element list
query ?
SELECT array_agg([1])
----
[[1]]

# test_approx_percentile_cont_decimal_support: the percentile argument is a decimal(10,2) value
query TI
SELECT
  c1,
  approx_percentile_cont(c2, CAST(0.85 AS DECIMAL(10,2))) AS apc
FROM aggregate_test_100
GROUP BY c1
ORDER BY c1
----
a 4
b 5
c 4
d 4
e 4


# array_agg_zero (same query as above, terminated with a semicolon)
query ?
SELECT array_agg([]);
----
[[]]

# array_agg_one (same query as above, terminated with a semicolon)
query ?
SELECT array_agg([1]);
----
[[1]]

# variance_single_value: sample statistics are NULL for one row, population statistics are 0
query RRRR
SELECT
  var(sq.column1),
  var_pop(sq.column1),
  stddev(sq.column1),
  stddev_pop(sq.column1)
FROM (VALUES (1.0)) AS sq;
----
NULL 0 NULL 0

# variance_two_values
query RRRR
SELECT
  var(sq.column1),
  var_pop(sq.column1),
  stddev(sq.column1),
  stddev_pop(sq.column1)
FROM (VALUES (1.0), (3.0)) AS sq;
----
2 1 1.414213562373 1


# aggregates on empty tables
statement ok
CREATE TABLE empty (
  column1 BIGINT,
  column2 INT
);

# no group by column: a single output row is still produced
query IIRIIIII
SELECT
  COUNT(column1),   -- counts should be zero, even for nulls
  SUM(column1),     -- other aggregates should be null
  AVG(column1),
  MIN(column1),
  MAX(column1),
  BIT_AND(column1),
  BIT_OR(column1),
  BIT_XOR(column1)
FROM empty
----
0 NULL NULL NULL NULL NULL NULL NULL

# Same query but with grouping (no groups, so no output)
query IIRIIIIII
SELECT
  COUNT(column1),
  SUM(column1),
  AVG(column1),
  MIN(column1),
  MAX(column1),
  BIT_AND(column1),
  BIT_OR(column1),
  BIT_XOR(column1),
  column2
FROM empty
GROUP BY column2
ORDER BY column2;
----


statement ok
DROP TABLE empty

# aggregates on all nulls: column1 is NULL in every row, column2 is the grouping key
statement ok
CREATE TABLE the_nulls
AS VALUES
  (CAST(NULL AS BIGINT), 1),
  (CAST(NULL AS BIGINT), 1),
  (CAST(NULL AS BIGINT), 2);

query II
SELECT
  column1,
  column2
FROM the_nulls
----
NULL 1
NULL 1
NULL 2

# no group by column
query IIRIIIII
SELECT
  COUNT(column1),   -- counts should be zero, even for nulls
  SUM(column1),     -- other aggregates should be null
  AVG(column1),
  MIN(column1),
  MAX(column1),
  BIT_AND(column1),
  BIT_OR(column1),
  BIT_XOR(column1)
FROM the_nulls
----
0 NULL NULL NULL NULL NULL NULL NULL

# Same query but with grouping: one row per column2 value
query IIRIIIIII
SELECT
  COUNT(column1),   -- counts should be zero, even for nulls
  SUM(column1),     -- other aggregates should be null
  AVG(column1),
  MIN(column1),
  MAX(column1),
  BIT_AND(column1),
  BIT_OR(column1),
  BIT_XOR(column1),
  column2
FROM the_nulls
GROUP BY column2
ORDER BY column2;
----
0 NULL NULL NULL NULL NULL NULL NULL 1
0 NULL NULL NULL NULL NULL NULL NULL 2


statement ok
DROP TABLE the_nulls;

# Fixture for the bitwise aggregates: c1/c2 are NOT NULL, c3 is nullable, tag is the grouping key
statement ok
CREATE TABLE bit_aggregate_functions (
  c1 SMALLINT NOT NULL,
  c2 SMALLINT NOT NULL,
  c3 SMALLINT,
  tag VARCHAR
)
AS VALUES
  (5, 10, 11, 'A'),
  (33, 11, NULL, 'B'),
  (9, 12, NULL, 'A');

# query_bit_and, query_bit_or, query_bit_xor
query IIIIIIIII
SELECT
  BIT_AND(c1),
  BIT_AND(c2),
  BIT_AND(c3),
  BIT_OR(c1),
  BIT_OR(c2),
  BIT_OR(c3),
  BIT_XOR(c1),
  BIT_XOR(c2),
  BIT_XOR(c3)
FROM bit_aggregate_functions
----
1 8 11 45 15 11 45 13 11

# query_bit_and, query_bit_or, query_bit_xor, with group
query IIIIIIIIIT
SELECT
  BIT_AND(c1),
  BIT_AND(c2),
  BIT_AND(c3),
  BIT_OR(c1),
  BIT_OR(c2),
  BIT_OR(c3),
  BIT_XOR(c1),
  BIT_XOR(c2),
  BIT_XOR(c3),
  tag
FROM bit_aggregate_functions
GROUP BY tag
ORDER BY tag
----
1 8 11 13 14 11 12 6 11 A
33 11 NULL 33 11 NULL 33 11 NULL B


# Fixture for bool_and / bool_or:
#   c1..c4 are NOT NULL, c5..c8 are nullable, and c8 is NULL in every row.
statement ok
create table bool_aggregate_functions (
  c1 boolean not null,
  c2 boolean not null,
  c3 boolean not null,
  c4 boolean not null,
  c5 boolean,
  c6 boolean,
  c7 boolean,
  c8 boolean
)
as values
  (true, true, false, false, true, true, null, null),
  (true, false, true, false, false, null, false, null),
  (true, true, false, false, null, true, false, null);

# query_bool_and
query BBBBBBBB
SELECT
  bool_and(c1),
  bool_and(c2),
  bool_and(c3),
  bool_and(c4),
  bool_and(c5),
  bool_and(c6),
  bool_and(c7),
  bool_and(c8)
FROM bool_aggregate_functions
----
true false false false false true false NULL

# query_bool_and_distinct: same expected row as the non-distinct form
query BBBBBBBB
SELECT
  bool_and(DISTINCT c1),
  bool_and(DISTINCT c2),
  bool_and(DISTINCT c3),
  bool_and(DISTINCT c4),
  bool_and(DISTINCT c5),
  bool_and(DISTINCT c6),
  bool_and(DISTINCT c7),
  bool_and(DISTINCT c8)
FROM bool_aggregate_functions
----
true false false false false true false NULL

# query_bool_or
query BBBBBBBB
SELECT
  bool_or(c1),
  bool_or(c2),
  bool_or(c3),
  bool_or(c4),
  bool_or(c5),
  bool_or(c6),
  bool_or(c7),
  bool_or(c8)
FROM bool_aggregate_functions
----
true true true false true true false NULL

# query_bool_or_distinct: same expected row as the non-distinct form
query BBBBBBBB
SELECT
  bool_or(DISTINCT c1),
  bool_or(DISTINCT c2),
  bool_or(DISTINCT c3),
  bool_or(DISTINCT c4),
  bool_or(DISTINCT c5),
  bool_or(DISTINCT c6),
  bool_or(DISTINCT c7),
  bool_or(DISTINCT c8)
FROM bool_aggregate_functions
----
true true true false true true false NULL

# All supported timestamp types

# "nanos" --> TimestampNanosecondArray
# "micros" --> TimestampMicrosecondArray
# "millis" --> TimestampMillisecondArray
# "secs" --> TimestampSecondArray
# "names" --> StringArray

statement ok
create table t_source
as values
 ('2018-11-13T17:11:10.011375885995', 'Row 0', 'X'),
 ('2011-12-13T11:13:10.12345',        'Row 1', 'X'),
 (null,                               'Row 2', 'Y'),
 ('2021-01-01T05:11:10.432',          'Row 3', 'Y');

statement ok
create table t as
select
  arrow_cast(column1, 'Timestamp(Nanosecond, None)') as nanos,
  arrow_cast(column1, 'Timestamp(Microsecond, None)') as micros,
  arrow_cast(column1, 'Timestamp(Millisecond, None)') as millis,
  arrow_cast(column1, 'Timestamp(Second, None)') as secs,
  column2 as names,
  column3 as tag
from t_source;

# Demonstrate the contents
query PPPPTT
select * from t;
----
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 X
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1 X
NULL NULL NULL NULL Row 2 Y
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3 Y


# aggregate_timestamps_sum
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Timestamp\(Nanosecond, None\)\)'\. You might need to add explicit type casts\.
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t;

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Timestamp\(Nanosecond, None\)\)'\. You might need to add explicit type casts\.
SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag;

# aggregate_timestamps_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t;
----
3 3 3 3

query TIIII
SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag;
----
X 2 2 2 2
Y 1 1 1 1

# aggregate_timestamps_min
query PPPP
SELECT min(nanos), min(micros), min(millis), min(secs) FROM t;
----
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10

query TPPPP
SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag;
----
X 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10
Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10

# aggregate_timestamps_max
query PPPP
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t;
----
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10

query TPPPP
SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag
----
X 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10
Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10


# aggregate_timestamps_avg
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Timestamp\(Nanosecond, None\)\)'\. You might need to add explicit type casts\.
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Timestamp\(Nanosecond, None\)\)'\. You might need to add explicit type casts\.
SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag;


statement ok
drop table t_source;

statement ok
drop table t;


# All supported Date types

# "date32" --> Date32Array
# "date64" --> Date64Array
# "names" --> StringArray

statement ok
create table t_source
as values
 ('2018-11-13', 'Row 0', 'X'),
 ('2011-12-13', 'Row 1', 'X'),
 (null,         'Row 2', 'Y'),
 ('2021-01-01', 'Row 3', 'Y');

statement ok
create table t as
-- Builds Date32 and Date64 columns from the string dates in t_source.
select
  arrow_cast(column1, 'Date32') as date32,
  -- Workaround: once https://github.com/apache/arrow-rs/issues/4512 is fixed,
  -- the direct cast on the next (commented-out) line can replace the double cast:
  -- arrow_cast(column1, 'Date64') as date64,
  arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64,
  column2 as names,
  column3 as tag
from t_source;

# Demonstrate the contents
query DDTT
select * from t;
----
2018-11-13 2018-11-13T00:00:00 Row 0 X
2011-12-13 2011-12-13T00:00:00 Row 1 X
NULL NULL Row 2 Y
2021-01-01 2021-01-01T00:00:00 Row 3 Y


# aggregate_dates_sum
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Date32\)'\. You might need to add explicit type casts\.
SELECT sum(date32), sum(date64) FROM t;

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Date32\)'\. You might need to add explicit type casts\.
SELECT tag, sum(date32), sum(date64) FROM t GROUP BY tag ORDER BY tag;

# aggregate_dates_count
query II
SELECT count(date32), count(date64) FROM t;
----
3 3

query TII
SELECT tag, count(date32), count(date64) FROM t GROUP BY tag ORDER BY tag;
----
X 2 2
Y 1 1

# aggregate_dates_min
query DD
SELECT min(date32), min(date64) FROM t;
----
2011-12-13 2011-12-13T00:00:00

query TDD
SELECT tag, min(date32), min(date64) FROM t GROUP BY tag ORDER BY tag;
----
X 2011-12-13 2011-12-13T00:00:00
Y 2021-01-01 2021-01-01T00:00:00

# aggregate_dates_max
query DD
SELECT max(date32), max(date64) FROM t;
----
2021-01-01 2021-01-01T00:00:00

query TDD
SELECT tag, max(date32), max(date64) FROM t GROUP BY tag ORDER BY tag
----
X 2018-11-13 2018-11-13T00:00:00
Y 2021-01-01 2021-01-01T00:00:00


# aggregate_dates_avg
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Date32\)'\. You might need to add explicit type casts\.
SELECT avg(date32), avg(date64) FROM t

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Date32\)'\. You might need to add explicit type casts\.
SELECT tag, avg(date32), avg(date64) FROM t GROUP BY tag ORDER BY tag;


statement ok
drop table t_source;

statement ok
drop table t;


# All supported time types

# Columns are named:
# "nanos" --> Time64NanosecondArray
# "micros" --> Time64MicrosecondArray
# "millis" --> Time32MillisecondArray
# "secs" --> Time32SecondArray
# "names" --> StringArray

statement ok
create table t_source
as values
 ('18:06:30.243620451', 'Row 0', 'A'),
 ('20:08:28.161121654', 'Row 1', 'A'),
 ('19:11:04.156423842', 'Row 2', 'B'),
 ('21:06:28.247821084', 'Row 3', 'B');


statement ok
create table t as
select
  arrow_cast(column1, 'Time64(Nanosecond)') as nanos,
  arrow_cast(column1, 'Time64(Microsecond)') as micros,
  arrow_cast(column1, 'Time32(Millisecond)') as millis,
  arrow_cast(column1, 'Time32(Second)') as secs,
  column2 as names,
  column3 as tag
from t_source;

# Demonstrate the contents
query DDDDTT
select * from t;
----
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0 A
20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1 A
19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2 B
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3 B

# aggregate_times_sum
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Time64\(Nanosecond\)\)'\. You might need to add explicit type casts\.
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'SUM\(Time64\(Nanosecond\)\)'\. You might need to add explicit type casts\.
SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag

# aggregate_times_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t
----
4 4 4 4

query TIIII
SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag
----
A 2 2 2 2
B 2 2 2 2


# aggregate_times_min
query DDDD
SELECT min(nanos), min(micros), min(millis), min(secs) FROM t
----
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30

query TDDDD
SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag
----
A 18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30
B 19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04

# aggregate_times_max
query DDDD
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t
----
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28

query TDDDD
SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag
----
A 20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28
B 21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28


# aggregate_times_avg
statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Time64\(Nanosecond\)\)'\. You might need to add explicit type casts\.
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t

statement error DataFusion error: Error during planning: No function matches the given name and argument types 'AVG\(Time64\(Nanosecond\)\)'\. You might need to add explicit type casts\.
SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag;

statement ok
drop table t_source;

statement ok
drop table t;


# aggregates on strings
statement ok
create table t_source
as values
 ('Foo', 1),
 ('Bar', 2),
 (null,  2),
 ('Baz', 1);

statement ok
create table t as
select
  arrow_cast(column1, 'Utf8') as utf8,
  arrow_cast(column1, 'LargeUtf8') as largeutf8,
  column2 as tag
from t_source;

# No group by
query TTITTI
SELECT
  min(utf8),
  max(utf8),
  count(utf8),
  min(largeutf8),
  max(largeutf8),
  count(largeutf8)
FROM t
----
Bar Foo 3 Bar Foo 3


# with groupby
query TTITTI
SELECT
  min(utf8),
  max(utf8),
  count(utf8),
  min(largeutf8),
  max(largeutf8),
  count(largeutf8)
FROM t
GROUP BY tag
ORDER BY tag
----
Baz Foo 2 Baz Foo 2
Bar Bar 1 Bar Bar 1


statement ok
drop table t_source;

statement ok
drop table t;


# aggregates on binary
statement ok
create table t_source
as values
 ('Foo', 1),
 ('Bar', 2),
 (null,  2),
 ('Baz', 1);

statement ok
create table t as
select
  arrow_cast(column1, 'Binary') as binary,
  arrow_cast(column1, 'LargeBinary') as largebinary,
  column2 as tag
from t_source;

# No group by
query ??I??I
SELECT
  min(binary),
  max(binary),
  count(binary),
  min(largebinary),
  max(largebinary),
  count(largebinary)
FROM t
----
426172 466f6f 3 426172 466f6f 3

# with groupby
query ??I??I
SELECT
  min(binary),
  max(binary),
  count(binary),
  min(largebinary),
  max(largebinary),
  count(largebinary)
FROM t
GROUP BY tag
ORDER BY tag
----
42617a 466f6f 2 42617a 466f6f 2
426172 426172 1 426172 426172 1


statement ok
drop table t_source;

statement ok
drop table t;


# median over an input with no rows yields NULL
query I
SELECT median(a)
FROM (
  SELECT 1 AS a
  WHERE 1 = 0
);
----
NULL

# approx_median over the same empty input raises an execution error instead
query error DataFusion error: Execution error: aggregate function needs at least one non-null element
SELECT approx_median(a)
FROM (
  SELECT 1 AS a
  WHERE 1 = 0
);


# aggregate_decimal_sum: whole table, then per c2 group
query RT
SELECT
  sum(c1),
  arrow_typeof(sum(c1))
FROM d_table;
----
100 Decimal128(20, 3)

query TRT
SELECT
  c2,
  sum(c1),
  arrow_typeof(sum(c1))
FROM d_table
GROUP BY c2
ORDER BY c2;
----
A 1100.045 Decimal128(20, 3)
B -1000.045 Decimal128(20, 3)


# aggregate_decimal_avg: whole table, then per c2 group
query RT
SELECT
  avg(c1),
  arrow_typeof(avg(c1))
FROM d_table
----
5 Decimal128(14, 7)

query TRT
SELECT
  c2,
  avg(c1),
  arrow_typeof(avg(c1))
FROM d_table
GROUP BY c2
ORDER BY c2
----
A 110.0045 Decimal128(14, 7)
B -100.0045 Decimal128(14, 7)

# Use PostgreSQL dialect
statement ok
set datafusion.sql_parser.dialect = 'Postgres';

# Fixture for the FILTER tests: three INT columns, c3 is NULL in the last row
statement ok
CREATE TABLE test_table (
  c1 INT,
  c2 INT,
  c3 INT
)

# Five rows spread over three c1 groups
statement ok
INSERT INTO test_table
VALUES
  (1, 10, 50),
  (1, 20, 60),
  (2, 10, 70),
  (2, 20, 80),
  (3, 10, NULL)

# query_group_by_with_filter
query III rowsort
SELECT
  c1,
  sum(c2) FILTER (WHERE c2 >= 20) AS sum_ge_20,
  sum(c2) FILTER (WHERE c2 < 1) AS sum_lt_1  -- no rows pass filter, so the output should be NULL
FROM test_table
GROUP BY c1
----
1 20 NULL
2 20 NULL
3 NULL NULL

# query_group_by_avg_with_filter
query IRR rowsort
SELECT
  c1,
  avg(c2) FILTER (WHERE c2 >= 20) AS avg_ge_20,
  avg(c2) FILTER (WHERE c2 < 1) AS avg_lt_1  -- no rows pass filter, so output should be null
FROM test_table
GROUP BY c1
----
1 20 NULL
2 20 NULL
3 NULL NULL

# query_group_by_with_multiple_filters: each aggregate has its own filter on a different column
query IIR rowsort
SELECT
  c1,
  sum(c2) FILTER (WHERE c2 >= 20) AS sum_c2,
  avg(c3) FILTER (WHERE c3 <= 70) AS avg_c3
FROM test_table
GROUP BY c1
----
1 20 55
2 20 70
3 NULL NULL

# query_group_by_distinct_with_filter: a filtered-out group counts 0, not NULL
query II rowsort
SELECT
  c1,
  count(DISTINCT c2) FILTER (WHERE c2 >= 20) AS distinct_c2_count
FROM test_table
GROUP BY c1
----
1 1
2 1
3 0

# query_without_group_by_with_filter
query I rowsort
SELECT sum(c2) FILTER (WHERE c2 >= 20) AS sum_c2
FROM test_table
----
40

# count_without_group_by_with_filter
query I rowsort
SELECT count(c2) FILTER (WHERE c2 >= 20) AS count_c2
FROM test_table
----
2

# query_with_and_without_filter: filtered and unfiltered SUM side by side
query III rowsort
SELECT
  c1,
  sum(c2) FILTER (WHERE c2 >= 20) AS result,
  sum(c2) AS result_no_filter
FROM test_table
GROUP BY c1;
----
1 20 30
2 20 30
3 NULL 10

# query_filter_on_different_column_than_aggregate
query I rowsort
SELECT sum(c1) FILTER (WHERE c2 < 30)
FROM test_table;
----
9

# query_test_empty_filter: no row passes the filter, so the sum is NULL
query I rowsort
SELECT sum(c2) FILTER (WHERE c2 >= 20000000) AS sum_c2
FROM test_table;
----
NULL

# Decimal fixture: c4 is NULL in every row, c3 is NULL in the last row
statement ok
CREATE TABLE test_decimal_table (
  c1 INT,
  c2 DECIMAL(5, 2),
  c3 DECIMAL(5, 1),
  c4 DECIMAL(5, 1)
)

# Six rows, two per c1 value
statement ok
INSERT INTO test_decimal_table
VALUES
  (1, 10.10, 100.1, NULL),
  (1, 20.20, 200.2, NULL),
  (2, 10.10, 700.1, NULL),
  (2, 20.20, 700.1, NULL),
  (3, 10.1, 100.1, NULL),
  (3, 10.1, NULL, NULL)

# aggregate_decimal_with_group_by
query IIRRRRIIR rowsort
SELECT
  c1,
  count(c2),
  avg(c2),
  sum(c2),
  min(c2),
  max(c2),
  count(c3),
  count(c4),
  sum(c4)
FROM test_decimal_table
GROUP BY c1
----
1 2 15.15 30.3 10.1 20.2 2 0 NULL
2 2 15.15 30.3 10.1 20.2 2 0 NULL
3 2 10.1 20.2 10.1 10.1 1 0 NULL

# aggregate_decimal_with_group_by_decimal: the grouping key is itself a decimal column
query RIRRRRIR rowsort
SELECT
  c3,
  count(c2),
  avg(c2),
  sum(c2),
  min(c2),
  max(c2),
  count(c4),
  sum(c4)
FROM test_decimal_table
GROUP BY c3
----
100.1 2 10.1 20.2 10.1 10.1 0 NULL
200.2 1 20.2 20.2 20.2 20.2 0 NULL
700.1 2 15.15 30.3 10.1 20.2 0 NULL
NULL 1 10.1 10.1 10.1 10.1 0 NULL

# Restore the default dialect
statement ok
set datafusion.sql_parser.dialect = 'Generic';

## Multiple distinct aggregates and dictionaries
statement ok
CREATE TABLE dict_test AS VALUES
  (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
  (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));

query I?
SELECT
  column1,
  column2
FROM dict_test;
----
1 foo
2 bar

# Two COUNT(DISTINCT ...) in one query, one of them over the dictionary column
query II
SELECT
  count(DISTINCT column1),
  count(DISTINCT column2)
FROM dict_test
GROUP BY column1;
----
1 1
1 1

statement ok
DROP TABLE dict_test;

# Prepare the table with dictionary values for testing
statement ok
CREATE TABLE value (x BIGINT) AS VALUES
  (1), (2), (3), (1), (3), (4), (5), (2);

# Re-encode x as Dictionary(Int64, Int32)
statement ok
CREATE TABLE value_dict AS
SELECT arrow_cast(x, 'Dictionary(Int64, Int32)') AS x_dict
FROM value;

query ?
SELECT x_dict
FROM value_dict;
----
1
2
3
1
3
4
5
2

# Aggregates over the whole dictionary column
query I
SELECT sum(x_dict)
FROM value_dict;
----
21

query R
SELECT avg(x_dict)
FROM value_dict;
----
2.625

query I
SELECT min(x_dict)
FROM value_dict;
----
1

query I
SELECT max(x_dict)
FROM value_dict;
----
5

# Aggregates after filtering on the dictionary column
query I
SELECT sum(x_dict)
FROM value_dict
WHERE x_dict > 3;
----
9

query R
SELECT avg(x_dict)
FROM value_dict
WHERE x_dict > 3;
----
4.5

query I
SELECT min(x_dict)
FROM value_dict
WHERE x_dict > 3;
----
4

query I
SELECT max(x_dict)
FROM value_dict
WHERE x_dict > 3;
----
5

# Aggregates grouped by an expression on the dictionary column
query I
SELECT sum(x_dict)
FROM value_dict
GROUP BY x_dict % 2
ORDER BY sum(x_dict);
----
8
13

query R
SELECT avg(x_dict)
FROM value_dict
GROUP BY x_dict % 2
ORDER BY avg(x_dict);
----
2.6
2.666666666667

query I
SELECT min(x_dict)
FROM value_dict
GROUP BY x_dict % 2
ORDER BY min(x_dict);
----
1
2

query I
SELECT max(x_dict)
FROM value_dict
GROUP BY x_dict % 2
ORDER BY max(x_dict);
----
4
5

# Grouping by the dictionary column keeps its dictionary type (one row per distinct value)
query T
SELECT arrow_typeof(x_dict)
FROM value_dict
GROUP BY x_dict;
----
Dictionary(Int64, Int32)
Dictionary(Int64, Int32)
Dictionary(Int64, Int32)
Dictionary(Int64, Int32)
Dictionary(Int64, Int32)

statement ok
DROP TABLE value

statement ok
DROP TABLE value_dict


# bool aggregation: x is the boolean input, g the grouping key (group 3 holds only NULL)
statement ok
CREATE TABLE value_bool (x BOOLEAN, g INT) AS VALUES
  (NULL, 0),
  (false, 0),
  (true, 0),
  (false, 1),
  (true, 2),
  (NULL, 3);

query B
SELECT min(x)
FROM value_bool;
----
false

query B
SELECT max(x)
FROM value_bool;
----
true

query B
SELECT min(x)
FROM value_bool
GROUP BY g
ORDER BY g;
----
false
false
true
NULL

query B
SELECT max(x)
FROM value_bool
GROUP BY g
ORDER BY g;
----
true
false
true
NULL

#
# Push limit into distinct group-by aggregation tests
#

# Make results deterministic
statement ok
set datafusion.optimizer.repartition_aggregations = false;

# DISTINCT on the GROUP BY column: the limit is pushed into every AggregateExec (lim=[5])
query TT
EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5;
----
logical_plan
Limit: skip=0, fetch=5
--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]]
----Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]]
------TableScan: aggregate_test_100 projection=[c3]
physical_plan
GlobalLimitExec: skip=0, fetch=5
--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[], lim=[5]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[5]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[], lim=[5]
------------CoalescePartitionsExec
--------------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[5]
----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], has_header=true

query I
SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5;
----
1
-40
29
-85
-82

query TT
EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5 offset 4;
----
logical_plan
Limit: skip=4, fetch=5
--Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]]
----TableScan: aggregate_test_100 projection=[c2, c3]
physical_plan
GlobalLimitExec: skip=4, fetch=5
--AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[9]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true

query II
SELECT c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5 offset 4;
----
5 -82
4 -111
3 104
3 13
1 38

# The limit should only apply to the aggregations which group by c3
query TT
EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 WHERE c3 between 10 and 20 group by c2, c3 limit 4;
----
logical_plan
Limit: skip=0, fetch=4
--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]]
----Projection: aggregate_test_100.c3
------Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]]
--------Filter: aggregate_test_100.c3 >= Int16(10) AND aggregate_test_100.c3 <= Int16(20)
----------TableScan: aggregate_test_100 projection=[c2, c3], partial_filters=[aggregate_test_100.c3 >= Int16(10), aggregate_test_100.c3 <= Int16(20)]
physical_plan
GlobalLimitExec: skip=0, fetch=4
--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[], lim=[4]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[], lim=[4]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------ProjectionExec: expr=[c3@1 as c3]
------------AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
--------------CoalescePartitionsExec
----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
------------------CoalesceBatchesExec: target_batch_size=8192
--------------------FilterExec: c3@1 >= 10 AND c3@1 <= 20
----------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true

query I
SELECT DISTINCT c3 FROM aggregate_test_100 WHERE c3 between 10 and 20 group by c2, c3 limit 4;
----
13
17
12
14

# An aggregate expression causes the limit to not be pushed to the aggregation
query TT
EXPLAIN SELECT max(c1), c2, c3 FROM aggregate_test_100 group by c2, c3 limit 5;
----
logical_plan
Projection: MAX(aggregate_test_100.c1), aggregate_test_100.c2, aggregate_test_100.c3
--Limit: skip=0, fetch=5
----Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[MAX(aggregate_test_100.c1)]]
------TableScan: aggregate_test_100 projection=[c1, c2, c3]
physical_plan
ProjectionExec: expr=[MAX(aggregate_test_100.c1)@2 as MAX(aggregate_test_100.c1), c2@0 as c2, c3@1 as c3]
--GlobalLimitExec: skip=0, fetch=5
----AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[MAX(aggregate_test_100.c1)]
------CoalescePartitionsExec
--------AggregateExec: mode=Partial, gby=[c2@1 as c2, c3@2 as c3], aggr=[MAX(aggregate_test_100.c1)]
----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], has_header=true

# TODO(msirek): Extend checking in LimitedDistinctAggregation equal groupings to ignore the order of columns
# in the group-by column lists, so the limit could be pushed to the lowest AggregateExec in this case
query TT
EXPLAIN SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c2, c3 limit 3 offset 10;
----
logical_plan
Limit: skip=10, fetch=3
--Aggregate: groupBy=[[aggregate_test_100.c3, aggregate_test_100.c2]], aggr=[[]]
----Projection: aggregate_test_100.c3, aggregate_test_100.c2
------Aggregate: groupBy=[[aggregate_test_100.c2, aggregate_test_100.c3]], aggr=[[]]
--------TableScan: aggregate_test_100 projection=[c2, c3]
physical_plan
GlobalLimitExec: skip=10, fetch=3
--AggregateExec: mode=Final, gby=[c3@0 as c3, c2@1 as c2], aggr=[], lim=[13]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[c3@0 as c3, c2@1 as c2], aggr=[], lim=[13]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------ProjectionExec: expr=[c3@1 as c3, c2@0 as c2]
------------AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
--------------CoalescePartitionsExec
----------------AggregateExec: mode=Partial, gby=[c2@0 as c2, c3@1 as c3], aggr=[]
------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true

query II
SELECT DISTINCT c3, c2 FROM aggregate_test_100 group by c2, c3 limit 3 offset 10;
----
57 1
-54 4
112 3

query TT
EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3;
----
logical_plan
Limit: skip=0, fetch=3
--Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]]
----TableScan: aggregate_test_100 projection=[c2, c3]
physical_plan
GlobalLimitExec: skip=0, fetch=3
--AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[3]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true

query II
SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3;
----
NULL NULL
2 NULL
5 NULL


statement ok
set datafusion.optimizer.enable_distinct_aggregation_soft_limit = false;

# The limit should not be pushed into the aggregations
query TT
EXPLAIN SELECT DISTINCT c3 FROM aggregate_test_100 group by c3 limit 5;
----
logical_plan
Limit: skip=0, fetch=5
--Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]]
----Aggregate: groupBy=[[aggregate_test_100.c3]], aggr=[[]]
------TableScan: aggregate_test_100 projection=[c3]
physical_plan
GlobalLimitExec: skip=0, fetch=5
--AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[]
----CoalescePartitionsExec
------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[]
--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
----------AggregateExec: mode=Final, gby=[c3@0 as c3], aggr=[]
------------CoalescePartitionsExec
--------------AggregateExec: mode=Partial, gby=[c3@0 as c3], aggr=[]
----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3], has_header=true

statement ok
set datafusion.optimizer.enable_distinct_aggregation_soft_limit = true;

statement ok
set datafusion.optimizer.repartition_aggregations = true;

#
# regr_*() tests
#

# regr_*() invalid input: each statement below is expected to fail

# no arguments
statement error
SELECT regr_slope();

# wildcard argument
statement error
SELECT regr_intercept(*);

# wildcard argument with a FROM clause
statement error
SELECT regr_count(*) FROM aggregate_test_100;

# a single argument
statement error
SELECT regr_r2(1);

# three arguments
statement error
SELECT regr_avgx(1, 2, 3);

# string literal as the second argument
statement error
SELECT regr_avgy(1, 'foo');

# string literal as the first argument
statement error
SELECT regr_sxx('foo', 1);

# string literals for both arguments
statement error
SELECT regr_syy('foo', 'bar');

# NULL paired with a string literal
statement error
SELECT regr_sxy(NULL, 'bar');


# regr_*() NULL results

# a single constant point
query RRRRRRRRR
select
    regr_slope(1, 1),
    regr_intercept(1, 1),
    regr_count(1, 1),
    regr_r2(1, 1),
    regr_avgx(1, 1),
    regr_avgy(1, 1),
    regr_sxx(1, 1),
    regr_syy(1, 1),
    regr_sxy(1, 1);
----
NULL NULL 1 NULL 1 1 0 0 0

# NULL as the second argument
query RRRRRRRRR
select
    regr_slope(1, NULL),
    regr_intercept(1, NULL),
    regr_count(1, NULL),
    regr_r2(1, NULL),
    regr_avgx(1, NULL),
    regr_avgy(1, NULL),
    regr_sxx(1, NULL),
    regr_syy(1, NULL),
    regr_sxy(1, NULL);
----
NULL NULL 0 NULL NULL NULL NULL NULL NULL

# NULL as the first argument
query RRRRRRRRR
select
    regr_slope(NULL, 1),
    regr_intercept(NULL, 1),
    regr_count(NULL, 1),
    regr_r2(NULL, 1),
    regr_avgx(NULL, 1),
    regr_avgy(NULL, 1),
    regr_sxx(NULL, 1),
    regr_syy(NULL, 1),
    regr_sxy(NULL, 1);
----
NULL NULL 0 NULL NULL NULL NULL NULL NULL

# NULL for both arguments
query RRRRRRRRR
select
    regr_slope(NULL, NULL),
    regr_intercept(NULL, NULL),
    regr_count(NULL, NULL),
    regr_r2(NULL, NULL),
    regr_avgx(NULL, NULL),
    regr_avgy(NULL, NULL),
    regr_sxx(NULL, NULL),
    regr_syy(NULL, NULL),
    regr_sxy(NULL, NULL);
----
NULL NULL 0 NULL NULL NULL NULL NULL NULL

# three points whose second-argument value (column1) is constant
query RRRRRRRRR
select
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
from (values (1,2), (1,4), (1,6));
----
NULL NULL 3 NULL 1 4 0 8 0


# regr_*() basic tests

# three inline points with column2 = 2 * column1: slope 2, intercept 0, r2 1
query RRRRRRRRR
SELECT
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
FROM (VALUES (1, 2), (2, 4), (3, 6));
----
2 0 3 1 2 4 2 8 4

# same aggregates over the c12 (DOUBLE) and c11 (FLOAT) columns of the CSV-backed table
query RRRRRRRRR
SELECT
    regr_slope(c12, c11),
    regr_intercept(c12, c11),
    regr_count(c12, c11),
    regr_r2(c12, c11),
    regr_avgx(c12, c11),
    regr_avgy(c12, c11),
    regr_sxx(c12, c11),
    regr_syy(c12, c11),
    regr_sxy(c12, c11)
FROM aggregate_test_100;
----
0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695


# regr_*() functions ignore NULLs

# the (1,NULL) row is skipped, leaving two complete pairs
query RRRRRRRRR
SELECT
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
FROM (VALUES (1, NULL), (2, 4), (3, 6));
----
2 0 2 1 2.5 5 0.5 2 1

# a NULL on either side drops the row; only (3,6) is counted
query RRRRRRRRR
SELECT
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
FROM (VALUES (1, NULL), (NULL, 4), (3, 6));
----
NULL NULL 1 NULL 3 6 0 0 0

# no row has both values present: count is 0 and everything else is NULL
query RRRRRRRRR
SELECT
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
FROM (VALUES (1, NULL), (NULL, 4), (NULL, NULL));
----
NULL NULL 0 NULL NULL NULL NULL NULL NULL

# grouped variant; group 'c' keeps a single pair because its (NULL,100) row is skipped
query TRRRRRRRRR rowsort
SELECT
    column3,
    regr_slope(column2, column1),
    regr_intercept(column2, column1),
    regr_count(column2, column1),
    regr_r2(column2, column1),
    regr_avgx(column2, column1),
    regr_avgy(column2, column1),
    regr_sxx(column2, column1),
    regr_syy(column2, column1),
    regr_sxy(column2, column1)
FROM (VALUES (1, 2, 'a'), (2, 4, 'a'), (1, 3, 'b'), (3, 9, 'b'), (1, 10, 'c'), (NULL, 100, 'c'))
GROUP BY column3;
----
a 2 0 2 1 1.5 3 0.5 2 1
b 3 0 2 1 2 6 2 18 6
c NULL NULL 1 NULL 1 10 0 0 0


# regr_*() testing merge_batch() from RegrAccumulator's internal implementation
# The same query is repeated with batch sizes 1, 2 and 3 and must return the same
# values each time; the batch size is set back to 8192 at the end of the section.
statement ok
SET datafusion.execution.batch_size = 1;

query RRRRRRRRR
SELECT
    regr_slope(c12, c11),
    regr_intercept(c12, c11),
    regr_count(c12, c11),
    regr_r2(c12, c11),
    regr_avgx(c12, c11),
    regr_avgy(c12, c11),
    regr_sxx(c12, c11),
    regr_syy(c12, c11),
    regr_sxy(c12, c11)
FROM aggregate_test_100;
----
0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695

statement ok
SET datafusion.execution.batch_size = 2;

query RRRRRRRRR
SELECT
    regr_slope(c12, c11),
    regr_intercept(c12, c11),
    regr_count(c12, c11),
    regr_r2(c12, c11),
    regr_avgx(c12, c11),
    regr_avgy(c12, c11),
    regr_sxx(c12, c11),
    regr_syy(c12, c11),
    regr_sxy(c12, c11)
FROM aggregate_test_100;
----
0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695

statement ok
SET datafusion.execution.batch_size = 3;

query RRRRRRRRR
SELECT
    regr_slope(c12, c11),
    regr_intercept(c12, c11),
    regr_count(c12, c11),
    regr_r2(c12, c11),
    regr_avgx(c12, c11),
    regr_avgy(c12, c11),
    regr_sxx(c12, c11),
    regr_syy(c12, c11),
    regr_sxy(c12, c11)
FROM aggregate_test_100;
----
0.051534002628 0.48427355347 100 0.001929150558 0.479274948239 0.508972509913 6.707779292571 9.234223721582 0.345678715695

# put the batch size back so later tests are unaffected (8192 is presumably the default)
statement ok
SET datafusion.execution.batch_size = 8192;


# regr_*() testing retract_batch() from RegrAccumulator's internal implementation
# Each output row aggregates the current row plus up to two preceding rows,
# ordered by column1. From the fourth row on, the oldest row leaves the frame
# each time it advances. column1 values are unique, so the frame contents are
# fully determined by the ORDER BY.
query RRRRRRRRR
SELECT
    regr_slope(column2, column1) OVER w AS slope,
    regr_intercept(column2, column1) OVER w AS intercept,
    regr_count(column2, column1) OVER w AS count,
    regr_r2(column2, column1) OVER w AS r2,
    regr_avgx(column2, column1) OVER w AS avgx,
    regr_avgy(column2, column1) OVER w AS avgy,
    regr_sxx(column2, column1) OVER w AS sxx,
    regr_syy(column2, column1) OVER w AS syy,
    regr_sxy(column2, column1) OVER w AS sxy
FROM (VALUES (1,2), (2,4), (3,6), (4,12), (5,15), (6,18)) AS t(column1, column2)
WINDOW w AS (ORDER BY column1 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
----
NULL NULL 1 NULL 1 2 0 0 0
2 0 2 1 1.5 3 0.5 2 1
2 0 3 1 2 4 2 8 4
4 -4.666666666667 3 0.923076923077 3 7.333333333333 2 34.666666666667 8
4.5 -7 3 0.964285714286 4 11 2 42 9
3 0 3 1 5 15 2 18 6

# Sliding-frame regr_*() where some rows inside the frame have a NULL column2.
# column1 has a tie at 3 ((3,6) and (3,NULL)). With a ROWS frame the frame
# contents depend on the order of tied rows, so column2 NULLS LAST is used as a
# tiebreaker to make the expected rows deterministic.
query RRRRRRRRR
SELECT
    regr_slope(column2, column1) OVER w AS slope,
    regr_intercept(column2, column1) OVER w AS intercept,
    regr_count(column2, column1) OVER w AS count,
    regr_r2(column2, column1) OVER w AS r2,
    regr_avgx(column2, column1) OVER w AS avgx,
    regr_avgy(column2, column1) OVER w AS avgy,
    regr_sxx(column2, column1) OVER w AS sxx,
    regr_syy(column2, column1) OVER w AS syy,
    regr_sxy(column2, column1) OVER w AS sxy
FROM (VALUES (1,2), (2,4), (3,6), (3, NULL), (4, NULL), (5,15), (6,18), (7, 21)) AS t(column1, column2)
WINDOW w AS (ORDER BY column1, column2 NULLS LAST ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);
----
NULL NULL 1 NULL 1 2 0 0 0
2 0 2 1 1.5 3 0.5 2 1
2 0 3 1 2 4 2 8 4
2 0 2 1 2.5 5 0.5 2 1
NULL NULL 1 NULL 3 6 0 0 0
NULL NULL 1 NULL 5 15 0 0 0
3 0 2 1 5.5 16.5 0.5 4.5 1.5
3 0 3 1 6 18 2 18 6

# STRING_AGG() argument validation: each statement below is expected to fail.
# No expected message is given, so any error satisfies these records.

# no arguments
statement error
SELECT STRING_AGG()

# three arguments, whereas the valid calls in this file pass two
statement error
SELECT STRING_AGG(1,2,3)

# an aggregate nested inside another aggregate, and only one outer argument
statement error
SELECT STRING_AGG(STRING_AGG('a', ','))

# STRING_AGG() over literal inputs

# a single value: no delimiter appears in the output
query T
select string_agg('a', ',')
----
a

# a NULL delimiter still yields the value; a NULL value yields NULL
query TTTT
select
    string_agg('a', ','),
    string_agg('a', NULL),
    string_agg(NULL, ','),
    string_agg(NULL, NULL)
----
a a NULL NULL

# empty-string value and empty-string delimiter
query TT
select
    string_agg('', '|'),
    string_agg('a', '');
----
(empty) a

# the NULL row is skipped; the two empty strings are joined by one delimiter
query T
select string_agg(column1, '|')
from (values (''), (NULL), (''));
----
|

# STRING_AGG() over a table, with and without matching rows
statement ok
CREATE TABLE strings(g INTEGER, x VARCHAR, y VARCHAR)

# INSERT returns a single row-count column, so the record type is `I`
query I
INSERT INTO strings VALUES (1,'a','/'), (1,'b','-'), (2,'i','/'), (2,NULL,'-'), (2,'j','+'), (3,'p','/'), (4,'x','/'), (4,'y','-'), (4,'z','+')
----
9

# the NULL x in group 2 is skipped, giving i|j
query IT
SELECT g, STRING_AGG(x,'|') FROM strings GROUP BY g ORDER BY g
----
1 a|b
2 i|j
3 p
4 x|y|z

# no row satisfies the filter, so the aggregate is NULL
query T
SELECT STRING_AGG(x,',') FROM strings WHERE g > 100
----
NULL

statement ok
drop table strings

# string_agg() over a CTE of values cast to varchar(1000), without GROUP BY
query T
WITH my_data AS (
    SELECT 'text1'::varchar(1000) AS my_column
    UNION ALL
    SELECT 'text1'::varchar(1000) AS my_column
    UNION ALL
    SELECT 'text1'::varchar(1000) AS my_column
)
SELECT string_agg(my_column, ', ') AS my_string_agg
FROM my_data
----
text1, text1, text1

# same input with a constant grouping key, aggregated per group
query T
WITH my_data AS (
    SELECT 1 AS dummy, 'text1'::varchar(1000) AS my_column
    UNION ALL
    SELECT 1 AS dummy, 'text1'::varchar(1000) AS my_column
    UNION ALL
    SELECT 1 AS dummy, 'text1'::varchar(1000) AS my_column
)
SELECT string_agg(my_column, ', ') AS my_string_agg
FROM my_data
GROUP BY dummy
----
text1, text1, text1


# Queries with nested count(*)

# the inner ungrouped count(*) produces one row, so the outer count is 1
query I
SELECT count(*)
FROM (SELECT count(*) FROM (SELECT 1));
----
1

# two aliased counts in the inner query still produce a single row
query I
SELECT count(*)
FROM (SELECT count(*) a, count(*) b FROM (SELECT 1));
----
1

# rule `aggregate_statistics` should not optimize MIN/MAX to wrong values on empty relation

# table with no rows
statement ok
CREATE TABLE empty(col0 INTEGER);

# MIN over zero rows must be NULL
query I
SELECT MIN(col0) FROM empty WHERE col0=1;
----
NULL

# MAX over zero rows must be NULL
query I
SELECT MAX(col0) FROM empty WHERE col0=1;
----
NULL

# without a filter, the physical plan is a NULL projection over PlaceholderRowExec
# (no aggregate or scan operator remains)
query TT
EXPLAIN SELECT MIN(col0) FROM empty;
----
logical_plan
Aggregate: groupBy=[[]], aggr=[[MIN(empty.col0)]]
--TableScan: empty projection=[col0]
physical_plan
ProjectionExec: expr=[NULL as MIN(empty.col0)]
--PlaceholderRowExec

# same plan shape for MAX
query TT
EXPLAIN SELECT MAX(col0) FROM empty;
----
logical_plan
Aggregate: groupBy=[[]], aggr=[[MAX(empty.col0)]]
--TableScan: empty projection=[col0]
physical_plan
ProjectionExec: expr=[NULL as MAX(empty.col0)]
--PlaceholderRowExec

statement ok
DROP TABLE empty;

# MIN/MAX over a one-row table whose only row (2) is removed by the filter
statement ok
CREATE TABLE t (col0 INTEGER) AS VALUES (2);

query I
SELECT MIN(col0)
FROM t
WHERE col0 = 1;
----
NULL

query I
SELECT MAX(col0)
FROM t
WHERE col0 = 1;
----
NULL

statement ok
DROP TABLE t;