Skip to content

Commit

Permalink
[feature](inverted index) add ignore_above property to prevent long s… (
Browse files Browse the repository at this point in the history
#28585)

When a string is too long, CLucene throws an error, and such a string is also too long to analyze.
So, by default, we ignore a string during indexing when it is longer than 256 bytes.
We add a property `ignore_above` so that users can customize this limit.
  • Loading branch information
qidaye authored Dec 19, 2023
1 parent 73a3d84 commit 9c9249e
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 19 deletions.
9 changes: 9 additions & 0 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,13 @@ CharFilterMap get_parser_char_filter_map_from_properties(
return char_filter_map;
}

// Returns the value stored under INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY in
// `properties`, or INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the key is
// not present. The value is returned as-is (a string); no parsing or
// validation is done here.
std::string get_parser_ignore_above_value_from_properties(
        const std::map<std::string, std::string>& properties) {
    const auto entry = properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
    if (entry == properties.end()) {
        // Property not configured on this index: fall back to the default.
        return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE;
    }
    return entry->second;
}

} // namespace doris
7 changes: 7 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";

// Index property key under which the ignore_above length limit is stored.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
// Default ignore_above value, kept as a string like other property values;
// returned by get_parser_ignore_above_value_from_properties() when the key is absent.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);

InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str);
Expand All @@ -82,4 +85,8 @@ std::string get_parser_phrase_support_string_from_properties(
CharFilterMap get_parser_char_filter_map_from_properties(
const std::map<std::string, std::string>& properties);

// Get the parser ignore_above value from the index properties.
// Returns the string stored under INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, or
// INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when that key is not set.
// The result is the raw property string; it is not converted to a number here.
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);

} // namespace doris
24 changes: 21 additions & 3 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,16 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
}
auto* v = (Slice*)values;
for (int i = 0; i < count; ++i) {
new_fulltext_field(v->get_data(), v->get_size());
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
if (v->get_size() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most 256, but got "
<< "value length:" << v->get_size() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
} else {
new_fulltext_field(v->get_data(), v->get_size());
}
RETURN_IF_ERROR(add_document());
++v;
_rid++;
Expand Down Expand Up @@ -325,9 +334,18 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
}

auto value = join(strings, " ");
new_fulltext_field(value.c_str(), value.length());
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
if (value.length() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most 256, but got "
<< "value length:" << value.length() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
} else {
new_fulltext_field(value.c_str(), value.length());
}
_rid++;
_index_writer->addDocument(_doc.get());
RETURN_IF_ERROR(add_document());
}
} else if constexpr (field_is_numeric_type(field_type)) {
for (int i = 0; i < count; ++i) {
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ The features for inverted index is as follows:
- char_replace: replace each char in the pattern with a char in the replacement
- char_filter_pattern: character array to be replaced
- char_filter_replacement: replaced character array, can be left unset, defaults to a space character
- ignore_above: Controls whether strings are indexed.
- Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed.
- default value is 256 bytes.
- COMMENT is optional

```sql
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
- char_replace 将pattern中每个char替换为一个replacement中的char
- char_filter_pattern:需要被替换掉的字符数组
- char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
- ignore_above:控制字符串是否建索引。
- 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
- 默认为 256 字节
- COMMENT 是可选的,用于指定注释

```sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";

public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";

public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
Expand Down Expand Up @@ -98,6 +100,17 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
if (parser == null && !properties.isEmpty()) {
throw new AnalysisException("invalid index properties, please check the properties");
}
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE);
if (ignoreAbove != null) {
try {
int ignoreAboveValue = Integer.parseInt(ignoreAbove);
if (ignoreAboveValue <= 0) {
throw new AnalysisException("invalid index properties, ignore_above must be positive");
}
} catch (NumberFormatException e) {
throw new AnalysisException("invalid index properties, ignore_above must be integer");
}
}
}

// default is "none" if not set
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_dk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
DUPLICATE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_uk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
UNIQUE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_dk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
DUPLICATE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_uk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
UNIQUE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_dk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
DUPLICATE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_uk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
UNIQUE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_dk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
DUPLICATE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_uk (
FTS_DOC_ID BIGINT NOT NULL,
a TEXT,
b TEXT,
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx'
INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx',
INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx'
)
UNIQUE KEY(FTS_DOC_ID)
DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3
Expand Down

0 comments on commit 9c9249e

Please sign in to comment.