From ab2345b7e3f1281e0e5f7e51c9e9b8f70c3464b9 Mon Sep 17 00:00:00 2001 From: Rebecca Taft Date: Tue, 2 Jul 2019 23:07:58 +0300 Subject: [PATCH] opt: fix floating point precision error in statisticsBuilder This commit fixes a floating point precision error in the statisticsBuilder code for estimating the selectivity due to a null-rejecting filter. Prior to this commit, the code was subtracting one from the nullsRemoved estimate if needed to avoid estimating selectivity=0. The problem is that if nullsRemoved is extremely large (e.g., 2e+20), subtracting 1 has no effect, because a difference of 1 is below the floating point precision at that magnitude. This commit changes the logic so that we instead multiply the selectivity directly by a small number (1e-7) when the estimate would otherwise be zero. This serves the same purpose as subtracting 1 from nullsRemoved (keeping the selectivity non-zero), but without the risk of a floating point error. Fixes #38344 Release note: None --- pkg/sql/opt/memo/statistics_builder.go | 16 ++++----- pkg/sql/opt/memo/testdata/stats/select | 50 ++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/pkg/sql/opt/memo/statistics_builder.go b/pkg/sql/opt/memo/statistics_builder.go index 202c75bff20e..cf750e33dbbd 100644 --- a/pkg/sql/opt/memo/statistics_builder.go +++ b/pkg/sql/opt/memo/statistics_builder.go @@ -2657,12 +2657,12 @@ func (sb *statisticsBuilder) selectivityFromNullCounts( // We want to avoid setting selectivity to zero because the stats may be // stale, and we can end up with weird and inefficient plans if we - // estimate zero rows. Adjust the estimate for nullsRemoved to avoid - // this. + // estimate zero rows. Multiply by a small number instead.
if nullsRemoved == rowCount { - nullsRemoved = max(nullsRemoved-1, 0) + selectivity *= 1e-7 + } else { + selectivity *= 1 - nullsRemoved/rowCount } - selectivity *= 1 - nullsRemoved/rowCount } } @@ -2718,12 +2718,12 @@ func (sb *statisticsBuilder) joinSelectivityFromNullCounts( if colStat.NullCount == 0 { // We want to avoid setting selectivity to zero because the stats may be // stale, and we can end up with weird and inefficient plans if we - // estimate zero rows. Adjust the estimate for crossJoinNullCount to - // avoid this. + // estimate zero rows. Multiply by a small number instead. if crossJoinNullCount == inputRowCount { - crossJoinNullCount = max(crossJoinNullCount-1, 0) + selectivity *= 1e-7 + } else { + selectivity *= 1 - crossJoinNullCount/inputRowCount } - selectivity *= 1 - crossJoinNullCount/inputRowCount } } diff --git a/pkg/sql/opt/memo/testdata/stats/select b/pkg/sql/opt/memo/testdata/stats/select index ec09c83c2b12..0e7734f303b1 100644 --- a/pkg/sql/opt/memo/testdata/stats/select +++ b/pkg/sql/opt/memo/testdata/stats/select @@ -1246,12 +1246,12 @@ HAVING project ├── columns: "?column?":5(int!null) ├── cardinality: [0 - 2] - ├── stats: [rows=0.5] + ├── stats: [rows=1e-07] ├── fd: ()-->(5) ├── select │ ├── columns: column2:2(string) column3:3(varbit) min:4(bool!null) │ ├── cardinality: [0 - 2] - │ ├── stats: [rows=0.5, distinct(4)=0.5, null(4)=0] + │ ├── stats: [rows=1e-07, distinct(4)=1e-07, null(4)=0] │ ├── key: (2,3) │ ├── fd: ()-->(4) │ ├── group-by @@ -1342,3 +1342,49 @@ select │ └── stats: [rows=1000, distinct(4)=100, null(4)=0] └── filters └── (d3 >= '1903-10-01') AND (d3 < '2003-10-01') [type=bool, outer=(4), constraints=(/4: [/'1903-10-01' - /'2003-09-30']; tight)] + +# Regression test for #38344. Avoid floating point precision errors. 
+exec-ddl +CREATE TABLE t38344 (x BOOL) +---- + +exec-ddl +ALTER TABLE t38344 INJECT STATISTICS '[ + { + "columns": ["x"], + "created_at": "2018-01-01 1:00:00.00000+00:00", + "row_count": 20000000000, + "distinct_count": 1, + "null_count": 20000000000 + } +]' +---- + +norm +WITH t(x) AS ( + SELECT (t1.x::int << 5533)::bool OR t2.x AS x + FROM t38344 AS t1 LEFT JOIN t38344 AS t2 ON true +) +SELECT x FROM t WHERE x +---- +select + ├── columns: x:5(bool!null) + ├── stats: [rows=4e+13, distinct(5)=1, null(5)=0] + ├── fd: ()-->(5) + ├── project + │ ├── columns: x:5(bool) + │ ├── stats: [rows=4e+20, distinct(5)=1, null(5)=4e+20] + │ ├── left-join + │ │ ├── columns: t1.x:1(bool) t2.x:3(bool) + │ │ ├── stats: [rows=4e+20, distinct(1,3)=1, null(1,3)=4e+20] + │ │ ├── scan t1 + │ │ │ ├── columns: t1.x:1(bool) + │ │ │ └── stats: [rows=2e+10, distinct(1)=1, null(1)=2e+10] + │ │ ├── scan t2 + │ │ │ ├── columns: t2.x:3(bool) + │ │ │ └── stats: [rows=2e+10, distinct(3)=1, null(3)=2e+10] + │ │ └── filters (true) + │ └── projections + │ └── (t1.x::INT8 << 5533)::BOOL OR t2.x [type=bool, outer=(1,3)] + └── filters + └── variable: x [type=bool, outer=(5), constraints=(/5: [/true - /true]; tight), fd=()-->(5)]