Skip to content

Commit

Permalink
[fix](stats) Store max/min by base64
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikyou1997 authored and 胥剑旭 committed Dec 14, 2023
1 parent b3e5fae commit bd33a16
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ public abstract class BaseAnalysisTask {
protected static final String INSERT_COL_STATISTICS = "INSERT INTO "
+ "${internalDB}.${columnStatTbl}"
+ " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, "
+ " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n"
+ " ndv, null_count,"
+ " to_base64(CAST(min AS string)), to_base64(CAST(max AS string)), data_size, update_time\n"
+ " FROM \n"
+ " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
+ " ${catalogId} AS catalog_id, "
Expand All @@ -89,8 +90,8 @@ public abstract class BaseAnalysisTask {
+ " NULL AS part_id, "
+ " SUM(count) AS row_count, \n"
+ " SUM(null_count) AS null_count, "
+ " MIN(CAST(min AS ${type})) AS min, "
+ " MAX(CAST(max AS ${type})) AS max, "
+ " MIN(CAST(from_base64(min) AS ${type})) AS min, "
+ " MAX(CAST(from_base64(max) AS ${type})) AS max, "
+ " SUM(data_size_in_bytes) AS data_size, "
+ " NOW() AS update_time \n"
+ " FROM ${internalDB}.${columnStatTbl}"
Expand All @@ -114,8 +115,8 @@ public abstract class BaseAnalysisTask {
+ "${row_count} AS row_count, "
+ "${ndv} AS ndv, "
+ "${null_count} AS null_count, "
+ "'${min}' AS min, "
+ "'${max}' AS max, "
+ "to_base64('${min}') AS min, "
+ "to_base64('${max}') AS max, "
+ "${data_size} AS data_size, "
+ "NOW() ";

Expand Down Expand Up @@ -241,7 +242,7 @@ protected String getDataSizeFunction(Column column) {
// The min value is not accurate when sampling, so set it to NULL to keep the optimizer from generating a bad plan.
protected String getMinFunction() {
if (tableSample == null) {
return "MIN(`${colName}`) ";
return "MIN(CAST(min AS ${type}))";
} else {
return "NULL ";
}
Expand All @@ -250,7 +251,7 @@ protected String getMinFunction() {
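// The max value is not accurate when sampling, so set it to NULL to keep the optimizer from generating a bad plan.
// NOTE(review): the updated return below applies MAX to `min`; this looks like a copy-paste slip for `max` - confirm.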
protected String getMaxFunction() {
if (tableSample == null) {
return "MAX(`${colName}`) ";
return "MAX(CAST(min AS ${type}))";
} else {
return "NULL ";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import org.apache.doris.statistics.util.StatisticsUtil;

import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.StringJoiner;

/**
Expand Down Expand Up @@ -73,8 +75,10 @@ public String toSQL(boolean roundByParentheses) {
sj.add(String.valueOf(count));
sj.add(String.valueOf(ndv));
sj.add(String.valueOf(nullCount));
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(minLit)));
sj.add(StatisticsUtil.quote(StatisticsUtil.escapeSQL(maxLit)));
sj.add(minLit == null ? "NULL" :
"'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'");
sj.add(maxLit == null ? "NULL" :
"'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'");
sj.add(String.valueOf(dataSizeInBytes));
sj.add(StatisticsUtil.quote(updateTime));
return sj.toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;

import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -172,6 +174,9 @@ public static ColumnStatistic fromResultRow(ResultRow row) {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
min = new String(Base64.getDecoder().decode(min),
StandardCharsets.UTF_8);

try {
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
Expand All @@ -183,6 +188,10 @@ public static ColumnStatistic fromResultRow(ResultRow row) {
columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {

max = new String(Base64.getDecoder().decode(max),
StandardCharsets.UTF_8);

try {
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, "
+ NDV_SAMPLE_TEMPLATE
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, "
+ "${minFunction} AS min, "
+ "${maxFunction} AS max, "
+ "to_base64(${minFunction}) AS min, "
+ "to_base64(${maxFunction}) AS max, "
+ "${dataSizeFunction} * ${scaleFactor} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}";
Expand All @@ -81,8 +81,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "COUNT(1) AS row_count, "
+ "NDV(`${colName}`) AS ndv, "
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "to_base64(MIN(`${colName}`)) AS min, "
+ "to_base64(MAX(`${colName}`)) AS max, "
+ "${dataSizeFunction} AS data_size, "
+ "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where ";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
+ "COUNT(1) AS row_count, "
+ "NDV(`${colName}`) AS ndv, "
+ "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, "
+ "MIN(`${colName}`) AS min, "
+ "MAX(`${colName}`) AS max, "
+ "to_base64(MIN(`${colName}`)) AS min, "
+ "to_base64(MAX(`${colName}`)) AS max, "
+ "${dataSizeFunction} AS data_size, "
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}`";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -121,8 +122,9 @@ protected void doSample() throws Exception {
List<Long> tabletIds = pair.first;
double scaleFactor = (double) tbl.getRowCount() / (double) pair.second;
// might happen if row count in fe metadata hasn't been updated yet
if (Double.isInfinite(scaleFactor)) {
if (Double.isInfinite(scaleFactor) || Double.isNaN(scaleFactor)) {
scaleFactor = 1;
tabletIds = Collections.emptyList();
}
String tabletStr = tabletIds.stream()
.map(Object::toString)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ public static ResultRow mockResultRow(boolean col) {
add("8");
add("0");
add("10");
add("11");
// Base64 encoding of "11"
add("MTE=");
add("12");
add(String.valueOf(System.currentTimeMillis()));
}};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ suite("test_mysql_jdbc_statistics", "p0,external,mysql,external_docker,external_
assertTrue(result[0][3] == "0.0")
assertTrue(result[0][4] == "15.0")
assertTrue(result[0][5] == "3.0")
assertTrue(result[0][6] == "'abc'")
assertTrue(result[0][7] == "'abg'")
assertEquals(result[0][6], "'abc'")
assertEquals(result[0][7], "'abg'")

result = sql """show column stats ex_tb0 (id)"""
assertTrue(result.size() == 1)
Expand Down
29 changes: 28 additions & 1 deletion regression-test/suites/statistics/analyze_stats.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ PARTITION `p599` VALUES IN (599)

sql """ANALYZE TABLE test_600_partition_table_analyze WITH SYNC"""

// column_name | count | ndv | num_null | data_size | avg_size_byte | min | max | updated_time
// 0:column_name | 1:count | 2:ndv | 3:num_null | 4:data_size | 5:avg_size_byte | 6:min | 7:max | 8:updated_time
id_col_stats = sql """
SHOW COLUMN CACHED STATS test_600_partition_table_analyze(id);
"""
Expand Down Expand Up @@ -1124,4 +1124,31 @@ PARTITION `p599` VALUES IN (599)
result = sql """SHOW COLUMN STATS test_analyze_specific_column"""
assert result.size() == 1

// Test min/max values that contain quote characters or non-ASCII text.
sql """
DROP TABLE IF EXISTS test_max_min_lit;
"""

sql """
CREATE TABLE test_max_min_lit (
`col1` varchar(32) NULL
) ENGINE=OLAP
DUPLICATE KEY(`col1`)
COMMENT 'OLAP'
DISTRIBUTED BY HASH(`col1`) BUCKETS 3
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""

sql """INSERT INTO test_max_min_lit VALUES("\\'")"""
sql """INSERT INTO test_max_min_lit VALUES('\\';')"""
sql "INSERT INTO test_max_min_lit VALUES('测试')"

sql """ANALYZE TABLE test_max_min_lit WITH SYNC"""
// Fetch the cached column statistics for the table analyzed just above.
def max = sql """show column cached stats test_max_min_lit"""
// Returns true when the value at index 7 of the first result row equals expected_value.
// NOTE(review): index 7 is presumably the `max` column, going by the column-layout comment
// on the earlier SHOW COLUMN CACHED STATS call in this suite - confirm.
def expected_max = { r, expected_value ->
return (r[0][7]).equals(expected_value)
}
// NOTE(review): the boolean returned by this call is discarded, so this line cannot fail the test.
// It probably needs to be wrapped in an assertion. Before doing so, confirm the exact format of
// the reported value (the MySQL JDBC statistics suite compares against a quoted literal, "'abc'").
expected_max(max, "测试")
}

0 comments on commit bd33a16

Please sign in to comment.