Skip to content

Commit

Permalink
[improvement](statistics)Multi bucket columns using DUJ1 to collect ndv
Browse files Browse the repository at this point in the history
  • Loading branch information
Jibing-Li authored and gnehil committed Dec 4, 2023
1 parent 181c01b commit 2595bd5
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
Expand Down Expand Up @@ -770,6 +769,7 @@ public void markAutoBucket() {
defaultDistributionInfo.markAutoBucket();
}

@Override
public Set<String> getDistributionColumnNames() {
Set<String> distributionColumnNames = Sets.newHashSet();
if (defaultDistributionInfo instanceof RandomDistributionInfo) {
Expand Down Expand Up @@ -2308,7 +2308,7 @@ public long getDataSize(boolean singleReplica) {
/**
 * Tells whether the given column is one of this table's distribution columns.
 *
 * <p>The comparison ignores case. It uses {@link String#equalsIgnoreCase(String)},
 * which does not depend on the JVM default locale, so both sides of the comparison
 * are folded the same way on every host.
 *
 * @param columnName name of the column to look up; {@code null} never matches
 * @return {@code true} if a distribution column with that name (ignoring case) exists
 */
public boolean isDistributionColumn(String columnName) {
    // A single pass over the names: no intermediate lower-cased set is needed,
    // and only one return statement remains (the stale duplicate is gone).
    return getDistributionColumnNames().stream()
            .anyMatch(distributionColumn -> distributionColumn.equalsIgnoreCase(columnName));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.apache.doris.thrift.TTableDescriptor;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

Expand Down Expand Up @@ -259,5 +260,9 @@ default boolean isDistributionColumn(String columnName) {
/**
 * Default answer for tables that do not override this method: no column is
 * reported as a partition column.
 *
 * @param columnName name of the column being asked about (unused by this default)
 * @return always {@code false}
 */
default boolean isPartitionColumn(String columnName) {
    return false;
}

/**
 * Default for tables that do not override this method: reports no distribution
 * columns.
 *
 * @return a newly created, empty set on every call
 */
default Set<String> getDistributionColumnNames() {
    Set<String> noDistributionColumns = Sets.newHashSet();
    return noDistributionColumns;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,13 @@ public long getDataSize(boolean singleReplica) {
@Override
public boolean isDistributionColumn(String columnName) {
return getRemoteTable().getSd().getBucketCols().stream().map(String::toLowerCase)
.collect(Collectors.toSet()).contains(columnName.toLowerCase(Locale.ROOT));
.collect(Collectors.toSet()).contains(columnName.toLowerCase());
}

/**
 * Collects the remote table's bucket column names, lower-cased, into a set.
 *
 * @return the lower-cased bucket column names of the remote table
 */
@Override
public Set<String> getDistributionColumnNames() {
    final Set<String> lowerCasedBucketColumns = getRemoteTable().getSd().getBucketCols()
            .stream()
            .map(bucketColumn -> bucketColumn.toLowerCase())
            .collect(Collectors.toSet());
    return lowerCasedBucketColumns;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,9 @@ private void getOrdinaryColumnStats() throws Exception {
String.valueOf(sampleInfo.first * targetRows / StatisticsUtil.getHugeTableSampleRows()));
}
}
// Distribution columns don't fit for DUJ1 estimator, use linear estimator.
if (tbl.isDistributionColumn(col.getName())) {
// A single distribution column is not a good fit for the DUJ1 estimator; use the linear estimator instead.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,9 @@ protected void doSample() throws Exception {
}
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql;
// Distribution columns don't fit for DUJ1 estimator, use linear estimator.
if (tbl.isDistributionColumn(col.getName())) {
// A single distribution column is not a good fit for the DUJ1 estimator; use the linear estimator instead.
Set<String> distributionColumns = tbl.getDistributionColumnNames();
if (distributionColumns.size() == 1 && distributionColumns.contains(col.getName().toLowerCase())) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,76 @@ public boolean isDistributionColumn(String columnName) {
olapAnalysisTask.doSample();
}

/**
 * Manual sampling when the table reports two distribution columns.
 *
 * <p>The mocked table returns "test1" and "test2" as its distribution columns and
 * the analyzed column is "test1". The test passes when {@code doSample()} hands
 * {@code runQuery} exactly the SQL asserted below with {@code needEncode == false}.
 * NOTE(review): the asserted ndv expression looks like the DUJ1 estimator rather
 * than the linear template used for a single distribution column - confirm against
 * the templates in the analysis task.
 */
@Test
public void testManualSampleTwoDistributeKey(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
throws Exception {

// Recorded return values for the mocked table, catalog and database; the ids,
// the catalog name and the row count (500) all show up in the expected SQL.
new Expectations() {
{
tableIf.getRowCount();
result = 500;
tableIf.getId();
result = 30001;
catalogIf.getId();
result = 10001;
catalogIf.getName();
result = "catalogName";
databaseIf.getId();
result = 20001;
}
};

// Replace the parts of OlapAnalysisTask that would touch real tablets or run SQL.
new MockUp<OlapAnalysisTask>() {
// No concrete tablets, paired with the value 100.
@Mock
public Pair<List<Long>, Long> calcActualSampleTablets() {
return Pair.of(Lists.newArrayList(), 100L);
}

// Basic stats row holding "1" and "2".
// NOTE(review): 'MQ==' / 'Mg==' in the expected SQL are presumably these two
// values base64-encoded as min / max - confirm.
@Mock
public ResultRow collectBasicStat(AutoCloseConnectContext context) {
List<String> values = Lists.newArrayList();
values.add("1");
values.add("2");
return new ResultRow(values);
}

// The actual verification: the generated SQL must match character for character.
@Mock
public void runQuery(String sql, boolean needEncode) {
Assertions.assertFalse(needEncode);
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(t1.count) * COUNT(1) / (SUM(t1.count) - SUM(IF(t1.count = 1, 1, 0)) + SUM(IF(t1.count = 1, 1, 0)) * SUM(t1.count) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.count, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() FROM ( SELECT t0.`${colName}` as column_key, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
return;
}
};

// No real connect context is built; the mocked methods above ignore it.
new MockUp<StatisticsUtil>() {
@Mock
public AutoCloseConnectContext buildConnectContext(boolean scanLimit) {
return null;
}
};

// The table reports TWO distribution columns, one of which is the analyzed column.
new MockUp<OlapTable>() {
@Mock
public Set<String> getDistributionColumnNames() {
HashSet<String> cols = Sets.newHashSet();
cols.add("test1");
cols.add("test2");
return cols;
}
};

// Wire up a MANUAL analysis task for column "test1" with a TableSample(false, 100L), then run it.
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
olapAnalysisTask.col = new Column("test1", PrimitiveType.STRING);
olapAnalysisTask.tbl = tableIf;
AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
olapAnalysisTask.info = analysisInfoBuilder.build();
olapAnalysisTask.catalog = catalogIf;
olapAnalysisTask.db = databaseIf;
olapAnalysisTask.tableSample = new TableSample(false, 100L);
olapAnalysisTask.doSample();
}

@Test
public void testNeedLimitFalse(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
throws Exception {
Expand Down

0 comments on commit 2595bd5

Please sign in to comment.