Skip to content

Commit

Permalink
[improvement](statistics)Use count as ndv for unique/agg olap table s…
Browse files Browse the repository at this point in the history
…ingle key column (#27186)

For a unique/agg OLAP table with a single key column, the row count and the NDV of that column are always equal.
For this kind of column there is no need to calculate the NDV; simply use the row count as the NDV.
  • Loading branch information
Jibing-Li authored Nov 20, 2023
1 parent 6ed0be8 commit d939903
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ public abstract class BaseAnalysisTask {
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "ROUND(COUNT(1) * ${scaleFactor}) AS `row_count`, "
+ "ROUND(NDV(`${colName}`) * ${scaleFactor}) as `ndv`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS `null_count`, "
+ "${min} AS `min`, "
+ "${max} AS `max`, "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ private void getOrdinaryColumnStats() throws Exception {
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.doris.statistics;

import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.MaterializedIndex;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
Expand Down Expand Up @@ -129,21 +130,26 @@ protected void doSample() throws Exception {
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql;
// Single distribution column is not fit for DUJ1 estimator, use linear estimator.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
if (useLinearAnalyzeTemplate()) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(rowCount));
} else {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
}
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}]",
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
+ "is single unique key [{}]",
col.getName(), params.get("rowCount"), rowsToSample, params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
tbl.isPartitionColumn(col.getName()), col.isKey());
tbl.isPartitionColumn(col.getName()), col.isKey(), isSingleUniqueKey());
runQuery(sql, false);
}
}
Expand Down Expand Up @@ -278,4 +284,28 @@ protected long getSampleRows() {
}
return sampleRows;
}

/**
 * Decide whether this task should build its query from the linear analyze template.
 *
 * @return True if the column is the single key of a unique/agg table (see
 *         {@link #isSingleUniqueKey()}), or if it is the table's only distribution column.
 *         False otherwise.
 */
protected boolean useLinearAnalyzeTemplate() {
    if (!isSingleUniqueKey()) {
        // Not a single unique/agg key: fall back to the single-distribution-column check.
        Set<String> distColNames = tbl.getDistributionColumnNames();
        if (distColNames.size() != 1) {
            return false;
        }
        // Lookup uses the lower-cased column name.
        return distColNames.contains(col.getName().toLowerCase());
    }
    return true;
}

/**
 * Check whether the analyzed column is the one and only key column of a unique/agg olap table.
 *
 * @return True if the column is a key, the table has exactly one key column, and the table's
 *         keys type is UNIQUE_KEYS or AGG_KEYS. False otherwise.
 */
protected boolean isSingleUniqueKey() {
    OlapTable olapTable = (OlapTable) tbl;
    int keyColumnCount = olapTable.getKeysNum();
    KeysType tableKeysType = olapTable.getKeysType();
    if (!col.isKey() || keyColumnCount != 1) {
        return false;
    }
    if (tableKeysType.equals(KeysType.UNIQUE_KEYS)) {
        return true;
    }
    return tableKeysType.equals(KeysType.AGG_KEYS);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ public ResultRow collectBasicStat(AutoCloseConnectContext context) {
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, ROUND(COUNT(1) * 5.0) AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
return;
}
};
Expand Down

0 comments on commit d939903

Please sign in to comment.