diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index 5678a217b537f6..3d498ff5382518 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -119,4 +119,17 @@ CharFilterMap get_parser_char_filter_map_from_properties(
     return char_filter_map;
 }
 
+// Returns the "ignore_above" property of an inverted index as the raw property
+// string, or INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the property is not
+// present. Parsing the string into a number is left to the caller.
+std::string get_parser_ignore_above_value_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    // Single lookup: reuse the iterator instead of find() followed by at().
+    const auto it = properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
+    if (it != properties.end()) {
+        return it->second;
+    }
+    return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE;
+}
+
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index bf931a3ce4773d..ca1efe773af558 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -69,6 +69,11 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
 const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";
 
+// Index property holding the maximum length of a value that is still indexed.
+const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
+// Value used for "ignore_above" when the property is not set.
+const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";
+
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
@@ -82,4 +87,9 @@ std::string get_parser_phrase_support_string_from_properties(
 CharFilterMap get_parser_char_filter_map_from_properties(
         const std::map<std::string, std::string>& properties);
 
+// Returns the "ignore_above" property value as a string, or
+// INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the property is not set.
+std::string get_parser_ignore_above_value_from_properties(
+        const std::map<std::string, std::string>& properties);
+
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 0724559895746b..d397910891fb52 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -285,7 +285,22 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
             }
             auto* v = (Slice*)values;
+            // ignore_above is an index-level property: look it up and parse it once
+            // rather than on every row. The FE only accepts positive integers, so the
+            // cast to size_t does not change the value.
+            const std::string ignore_above_value =
+                    get_parser_ignore_above_value_from_properties(_index_meta->properties());
+            const auto ignore_above = static_cast<size_t>(std::stoi(ignore_above_value));
             for (int i = 0; i < count; ++i) {
-                new_fulltext_field(v->get_data(), v->get_size());
+                if (v->get_size() > ignore_above) {
+                    // Too long to index: add an empty field so that a document is still
+                    // written for this row.
+                    VLOG_DEBUG << "fulltext index value length can be at most " << ignore_above
+                               << ", but got value length:" << v->get_size()
+                               << ", ignore this value";
+                    new_fulltext_field(empty_value.c_str(), 0);
+                } else {
+                    new_fulltext_field(v->get_data(), v->get_size());
+                }
                 RETURN_IF_ERROR(add_document());
                 ++v;
                 _rid++;
@@ -325,9 +340,23 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
                 }
 
                 auto value = join(strings, " ");
-                new_fulltext_field(value.c_str(), value.length());
+                // TODO(review): ignore_above does not change between rows; consider parsing
+                // it once outside the enclosing loop.
+                // NOTE(review): the limit is applied to the joined value here, whereas the
+                // user docs describe a per-element check for string arrays - confirm.
+                const std::string ignore_above_value =
+                        get_parser_ignore_above_value_from_properties(_index_meta->properties());
+                const auto ignore_above = static_cast<size_t>(std::stoi(ignore_above_value));
+                if (value.length() > ignore_above) {
+                    VLOG_DEBUG << "fulltext index value length can be at most " << ignore_above
+                               << ", but got value length:" << value.length()
+                               << ", ignore this value";
+                    new_fulltext_field(empty_value.c_str(), 0);
+                } else {
+                    new_fulltext_field(value.c_str(), value.length());
+                }
                 _rid++;
-                _index_writer->addDocument(_doc.get());
+                RETURN_IF_ERROR(add_document());
             }
         } else if constexpr (field_is_numeric_type(field_type)) {
             for (int i = 0; i < count; ++i) {
diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md
index f86d47c8bbe167..f10b543807c13a 100644
--- a/docs/en/docs/data-table/index/inverted-index.md
+++ b/docs/en/docs/data-table/index/inverted-index.md
@@ -89,6 +89,9 @@ The features for inverted index is as follows:
         - char_replace: replace each char in 
the pattern with a char in the replacement
           - char_filter_pattern: character array to be replaced
           - char_filter_replacement: replaced character array, can be left unset, defaults to a space character
+    - ignore_above: Controls whether strings are indexed, based on their length.
+      - Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above is applied to each array element separately, and string elements longer than ignore_above will not be indexed.
+      - The default value is 256 bytes.
   - COMMENT is optional
 
 ```sql
diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md
index ad4c9a011d989e..e3cba26ed8f4ab 100644
--- a/docs/zh-CN/docs/data-table/index/inverted-index.md
+++ b/docs/zh-CN/docs/data-table/index/inverted-index.md
@@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
         - char_replace 将pattern中每个char替换为一个replacement中的char
           - char_filter_pattern:需要被替换掉的字符数组
           - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
+    - ignore_above:控制字符串是否建索引。
+      - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
+      - 默认为 256 字节
   - COMMENT 是可选的,用于指定注释
 
 ```sql
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index e6fcefb7e010b3..daeecede096aaa 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -43,6 +43,8 @@ public class InvertedIndexUtil {
 
     public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";
 
+    public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";
+
     public static String getInvertedIndexParser(Map<String, String> properties) {
         String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY);
         // default is "none" if not set
@@ -98,6 +100,19 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
         if (parser == null && !properties.isEmpty()) {
             throw new AnalysisException("invalid index properties, please check the properties");
         }
+        // ignore_above is optional; when present it must parse as a positive integer.
+        String ignoreAboveText = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE);
+        if (ignoreAboveText != null) {
+            int ignoreAboveLimit;
+            try {
+                ignoreAboveLimit = Integer.parseInt(ignoreAboveText);
+            } catch (NumberFormatException e) {
+                throw new AnalysisException("invalid index properties, ignore_above must be integer");
+            }
+            if (ignoreAboveLimit <= 0) {
+                throw new AnalysisException("invalid index properties, ignore_above must be positive");
+            }
+        }
     }
 
     // default is "none" if not set
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql
index e15884d8c87c62..57164f24d643b2 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_dk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 DUPLICATE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql
index 4b921139046980..d4649382337ac7 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql
+++ 
b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_uk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 UNIQUE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql
index 2a8954609d52bf..8974b5de553491 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_dk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 DUPLICATE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql
index 733c398ccc9247..2761ae00c57444 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_uk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 UNIQUE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql
index 03e3099aed4561..4be7d0bbaa8321 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_dk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 DUPLICATE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql
index e46c254da76f12..019a470786c1c2 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_uk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 UNIQUE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql
index 5faf2da04bbb4b..ddf83cd7ae6c10 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_dk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 DUPLICATE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql
index b594d5cd3cb2a4..3f4df358c23e27 100644
--- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql
+++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql
@@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_uk (
     FTS_DOC_ID BIGINT NOT NULL,
     a TEXT,
     b TEXT,
-    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
-    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
+    INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
+    INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
 )
 UNIQUE KEY(FTS_DOC_ID)
 DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3