Skip to content

Commit

Permalink
[pick][feature](inverted index) add ignore_above property to prevent …
Browse files Browse the repository at this point in the history
…long string from indexing #28585 #28819 (#29002)
  • Loading branch information
qidaye authored Dec 25, 2023
1 parent cae76a8 commit de95e76
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 4 deletions.
9 changes: 9 additions & 0 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ CharFilterMap get_parser_char_filter_map_from_properties(
return char_filter_map;
}

// Fetch the "ignore_above" setting from the index properties. When the key is
// not present, the built-in default value is returned instead.
std::string get_parser_ignore_above_value_from_properties(
        const std::map<std::string, std::string>& properties) {
    const auto entry = properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
    if (entry == properties.end()) {
        // Property not configured: use the default.
        return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE;
    }
    return entry->second;
}

std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties) {
if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) {
Expand Down
7 changes: 7 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";

// Property key under which the "ignore_above" length limit is stored.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
// Default "ignore_above" value, used when the property is not set. Kept as a
// string because property values are strings; callers parse it to an integer.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
Expand All @@ -84,6 +87,10 @@ std::string get_parser_phrase_support_string_from_properties(
CharFilterMap get_parser_char_filter_map_from_properties(
const std::map<std::string, std::string>& properties);

// Returns the value of the "ignore_above" property as a string, or
// INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the property is not present.
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);

std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties);
} // namespace doris
34 changes: 30 additions & 4 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,22 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
"field or index writer is null in inverted index writer");
}
auto* v = (Slice*)values;
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
new_fulltext_field(v->get_data(), v->get_size());
RETURN_IF_ERROR(add_document());
// only ignore_above UNTOKENIZED strings
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
v->get_size() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most "
<< ignore_above_value << ", but got "
<< "value length:" << v->get_size() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(v->get_data(), v->get_size());
RETURN_IF_ERROR(add_document());
}
++v;
_rid++;
}
Expand All @@ -319,6 +332,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::InternalError(
"field or index writer is null in inverted index writer");
}
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
// offsets[i+1] is now row element count
std::vector<std::string> strings;
Expand All @@ -335,9 +351,19 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
}

auto value = join(strings, " ");
new_fulltext_field(value.c_str(), value.length());
// only ignore_above UNTOKENIZED strings
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
value.length() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most "
<< ignore_above_value << ", but got "
<< "value length:" << value.length() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(value.c_str(), value.length());
RETURN_IF_ERROR(add_document());
}
_rid++;
_index_writer->addDocument(_doc.get());
}
} else if constexpr (field_is_numeric_type(field_type)) {
for (int i = 0; i < count; ++i) {
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ The features for inverted index is as follows:
- char_replace: replace each char in the pattern with a char in the replacement
- char_filter_pattern: character array to be replaced
- char_filter_replacement: replaced character array, can be left unset, defaults to a space character
- ignore_above: Controls the maximum length of strings that will be indexed.
- Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed.
- The default value is 256 bytes.
- lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching.
- true: Convert to lowercase
- false: Do not convert to lowercase
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
- char_replace 将pattern中每个char替换为一个replacement中的char
- char_filter_pattern:需要被替换掉的字符数组
- char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
- ignore_above:控制字符串是否建索引。
- 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
- 默认为 256 字节
- lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
- true: 转换小写
- false:不转换小写
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";

public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";

public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";

public static String getInvertedIndexParser(Map<String, String> properties) {
Expand Down Expand Up @@ -100,6 +102,17 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
if (parser == null && !properties.isEmpty()) {
throw new AnalysisException("invalid index properties, please check the properties");
}
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE);
if (ignoreAbove != null) {
try {
int ignoreAboveValue = Integer.parseInt(ignoreAbove);
if (ignoreAboveValue <= 0) {
throw new AnalysisException("invalid index properties, ignore_above must be positive");
}
} catch (NumberFormatException e) {
throw new AnalysisException("invalid index properties, ignore_above must be integer");
}
}
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
if (lowerCase != null) {
if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {
Expand Down

0 comments on commit de95e76

Please sign in to comment.