From 2595bd5cb9a2fa0235e1a38b23e230acb948b0e3 Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Tue, 14 Nov 2023 20:14:05 +0800 Subject: [PATCH] [improvement](statistics)Multi bucket columns using DUJ1 to collect ndv #26950 (#26976) backport #26950 --- .../org/apache/doris/catalog/OlapTable.java | 4 +- .../org/apache/doris/catalog/TableIf.java | 5 ++ .../catalog/external/HMSExternalTable.java | 8 ++- .../doris/statistics/HMSAnalysisTask.java | 5 +- .../doris/statistics/OlapAnalysisTask.java | 5 +- .../statistics/OlapAnalysisTaskTest.java | 70 +++++++++++++++++++ 6 files changed, 90 insertions(+), 7 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 3161b5a33714e4..73e0ff7ed9540f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -88,7 +88,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -770,6 +769,7 @@ public void markAutoBucket() { defaultDistributionInfo.markAutoBucket(); } + @Override public Set getDistributionColumnNames() { Set distributionColumnNames = Sets.newHashSet(); if (defaultDistributionInfo instanceof RandomDistributionInfo) { @@ -2308,7 +2308,7 @@ public long getDataSize(boolean singleReplica) { public boolean isDistributionColumn(String columnName) { Set distributeColumns = getDistributionColumnNames() .stream().map(String::toLowerCase).collect(Collectors.toSet()); - return distributeColumns.contains(columnName.toLowerCase(Locale.ROOT)); + return distributeColumns.contains(columnName.toLowerCase()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index f39ce5a13e3a2d..3539d17e269a53 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -27,6 +27,7 @@ import org.apache.doris.thrift.TTableDescriptor; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -259,5 +260,9 @@ default boolean isDistributionColumn(String columnName) { default boolean isPartitionColumn(String columnName) { return false; } + + default Set getDistributionColumnNames() { + return Sets.newHashSet(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 96e2d573c615b8..2729fdf7a95cb1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -703,7 +703,13 @@ public long getDataSize(boolean singleReplica) { @Override public boolean isDistributionColumn(String columnName) { return getRemoteTable().getSd().getBucketCols().stream().map(String::toLowerCase) - .collect(Collectors.toSet()).contains(columnName.toLowerCase(Locale.ROOT)); + .collect(Collectors.toSet()).contains(columnName.toLowerCase()); + } + + @Override + public Set getDistributionColumnNames() { + return getRemoteTable().getSd().getBucketCols().stream().map(String::toLowerCase) + .collect(Collectors.toSet()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index e4555dcd8bc96f..812bd615a6949f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -140,8 +140,9 @@ private void getOrdinaryColumnStats() throws Exception { String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows())); } } - // Distribution columns don't fit for DUJ1 estimator, use linear estimator. - if (tbl.isDistributionColumn(col.getName())) { + // Single distribution column is not fit for DUJ1 estimator, use linear estimator. + Set distributionColumns = tbl.getDistributionColumnNames(); + if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) { bucketFlag = true; sb.append(LINEAR_ANALYZE_TEMPLATE); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 42b43f52a7fe87..d7037580595adc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -129,8 +129,9 @@ protected void doSample() throws Exception { } StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql; - // Distribution columns don't fit for DUJ1 estimator, use linear estimator. - if (tbl.isDistributionColumn(col.getName())) { + // Single distribution column is not fit for DUJ1 estimator, use linear estimator. + Set distributionColumns = tbl.getDistributionColumnNames(); + if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) { params.put("min", StatisticsUtil.quote(min)); params.put("max", StatisticsUtil.quote(max)); sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 0431e373d48eb1..9437d2d0787660 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -256,6 +256,76 @@ public boolean isDistributionColumn(String columnName) { olapAnalysisTask.doSample(); } + @Test + public void testManualSampleTwoDistributeKey(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf) + throws Exception { + + new Expectations() { + { + tableIf.getRowCount(); + result = 500; + tableIf.getId(); + result = 30001; + catalogIf.getId(); + result = 10001; + catalogIf.getName(); + result = "catalogName"; + databaseIf.getId(); + result = 20001; + } + }; + + new MockUp() { + @Mock + public Pair, Long> calcActualSampleTablets() { + return Pair.of(Lists.newArrayList(), 100L); + } + + @Mock + public ResultRow collectBasicStat(AutoCloseConnectContext context) { + List values = Lists.newArrayList(); + values.add("1"); + values.add("2"); + return new ResultRow(values); + } + + @Mock + public void runQuery(String sql, boolean needEncode) { + Assertions.assertFalse(needEncode); + Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(t1.count) * COUNT(1) / (SUM(t1.count) - SUM(IF(t1.count = 1, 1, 0)) + SUM(IF(t1.count = 1, 1, 0)) * SUM(t1.count) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.count, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as column_key, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + return; + } + }; + + new MockUp() { + @Mock + public AutoCloseConnectContext buildConnectContext(boolean scanLimit) { + return null; + } + }; + + new MockUp() { + @Mock + public Set getDistributionColumnNames() { + HashSet cols = Sets.newHashSet(); + cols.add("test1"); + cols.add("test2"); + return cols; + } + }; + + OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); + olapAnalysisTask.col = new Column("test1", PrimitiveType.STRING); + olapAnalysisTask.tbl = tableIf; + AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder(); + analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL); + olapAnalysisTask.info = analysisInfoBuilder.build(); + olapAnalysisTask.catalog = catalogIf; + olapAnalysisTask.db = databaseIf; + olapAnalysisTask.tableSample = new TableSample(false, 100L); + olapAnalysisTask.doSample(); + } + @Test public void testNeedLimitFalse(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf) throws Exception {