[improvement](statistics)Use count as ndv for unique/agg olap table single key column (#27186)

Single key column of a unique/agg olap table has the same value for count and ndv. For this kind of column there is no need to calculate ndv; simply use count as ndv.
apache · Nov 20, 2023 · d939903 · d939903
1 parent 6ed0be8
commit d939903
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 8 deletions.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
  + "${idxId} AS `idx_id`, "
  + "'${colId}' AS `col_id`, "
  + "NULL AS `part_id`, "
- + "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
- + "ROUND(NDV(`${colName}`) * ${scaleFactor})  as `ndv`, "
+ + "${rowCount} AS `row_count`, "
+ + "${ndvFunction} as `ndv`, "
  + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
  + "${min} AS `min`, "
  + "${max} AS `max`, "

diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -145,6 +145,7 @@ private void getOrdinaryColumnStats() throws Exception {
  if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
  bucketFlag = true;
  sb.append(LINEAR_ANALYZE_TEMPLATE);
+ params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
  params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
  } else {
  sb.append(DUJ1_ANALYZE_TEMPLATE);

diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -17,6 +17,7 @@
 
 package org.apache.doris.statistics;
 
+import org.apache.doris.catalog.KeysType;
 import org.apache.doris.catalog.MaterializedIndex;
 import org.apache.doris.catalog.OlapTable;
 import org.apache.doris.catalog.Partition;
@@ -129,21 +130,26 @@ protected void doSample() throws Exception {
  }
  StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
  String sql;
- // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
- Set<String> distributionColumns = tbl.getDistributionColumnNames();
- if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
+ if (useLinearAnalyzeTemplate()) {
  params.put("min", StatisticsUtil.quote(min));
  params.put("max", StatisticsUtil.quote(max));
+ // For single unique key, use count as ndv.
+ if (isSingleUniqueKey()) {
+ params.put("ndvFunction", String.valueOf(rowCount));
+ } else {
+ params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
+ }
  sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
  } else {
  params.put("dataSizeFunction", getDataSizeFunction(col, true));
  sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
  }
  LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
- + "limited [{}], distribute column [{}], partition column [{}], key column [{}]",
+ + "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
+ + "is single unique key [{}]",
  col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
  limitFlag, tbl.isDistributionColumn(col.getName()),
- tbl.isPartitionColumn(col.getName()), col.isKey());
+ tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
  runQuery(sql, false);
  }
  }
@@ -278,4 +284,28 @@ protected long getSampleRows() {
  }
  return sampleRows;
  }
+
+ /**
+  * Decide whether the sample query should be built from the linear analyze template
+  * instead of the DUJ1 one.
+  *
+  * <p>The linear template is chosen in two cases: the column is the single key column of a
+  * unique/agg table (see {@link #isSingleUniqueKey()}), or the column is the table's only
+  * distribution column, which is not a good fit for the DUJ1 estimator.
+  *
+  * @return True if either condition holds. False otherwise.
+  */
+ protected boolean useLinearAnalyzeTemplate() {
+     boolean linear = isSingleUniqueKey();
+     if (!linear) {
+         // Not a single unique/agg key: fall back to the single-distribution-column check.
+         Set<String> distributionNames = tbl.getDistributionColumnNames();
+         linear = distributionNames.size() == 1
+                 && distributionNames.contains(col.getName().toLowerCase());
+     }
+     return linear;
+ }
+
+ /**
+  * Check whether the analyzed column is the only key column of a unique/agg olap table.
+  *
+  * <p>All three conditions must hold: the column is a key column, the table reports exactly
+  * one key column, and the table's keys type is UNIQUE_KEYS or AGG_KEYS.
+  *
+  * @return True if the column is the single key of a unique/agg table. False otherwise.
+  */
+ protected boolean isSingleUniqueKey() {
+     OlapTable olapTable = (OlapTable) tbl;
+     int keyColumnCount = olapTable.getKeysNum();
+     KeysType tableKeysType = olapTable.getKeysType();
+     if (!col.isKey() || keyColumnCount != 1) {
+         return false;
+     }
+     return tableKeysType.equals(KeysType.UNIQUE_KEYS) || tableKeysType.equals(KeysType.AGG_KEYS);
+ }
 }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -218,7 +218,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
  @Mock
  public void runQuery(String sql, boolean needEncode) {
  Assertions.assertFalse(needEncode);
- Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, ROUND(COUNT(1) * 5.0) AS `row_count`, ROUND(NDV(`${colName}`) * 5.0)  as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
+ Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
  return;
  }
  };