Skip to content

Commit

Permalink
add support for regexp
Browse files Browse the repository at this point in the history
  • Loading branch information
windtalker committed Feb 8, 2022
1 parent 89e672c commit 1ac1a86
Show file tree
Hide file tree
Showing 14 changed files with 2,723 additions and 113 deletions.
3 changes: 2 additions & 1 deletion dbms/src/Common/OptimizedRegularExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ class OptimizedRegularExpressionImpl
{
RE_CASELESS = 0x00000001,
RE_NO_CAPTURE = 0x00000010,
RE_DOT_NL = 0x00000100
RE_DOT_NL = 0x00000100,
RE_NO_OPTIMIZE = 0x00001000
};

using Match = OptimizedRegularExpressionDetails::Match;
Expand Down
17 changes: 14 additions & 3 deletions dbms/src/Common/OptimizedRegularExpression.inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,21 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
if (options & RE_NO_OPTIMIZE)
{
/// The query comes from TiDB. Since analyze() does not yet handle all cases, skip the optimization
/// to avoid incompatibility issues.
is_trivial = false;
required_substring.clear();
required_substring_is_prefix = false;
}
else
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
}

/// Just three following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
/// Just four following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL | RE_NO_OPTIMIZE)))
throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");

is_case_insensitive = options & RE_CASELESS;
Expand Down
28 changes: 27 additions & 1 deletion dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,30 @@ String DAGExpressionAnalyzerHelper::buildRoundFunction(
return analyzer->applyFunction("tidbRoundWithFrac", argument_names, actions, getCollatorFromExpr(expr));
}

/// Builds the function call for a regexp-family expression (registered in
/// `function_builder_map` under "regexp" and "replaceRegexpAll").
///
/// @param analyzer  analyzer used to build the child actions and to apply the function.
/// @param expr      the tipb expression; its signature selects the collator.
/// @param actions   action chain that the child actions and the function call are added to.
/// @return          the name returned by `analyzer->applyFunction` for the built call.
String DAGExpressionAnalyzerHelper::buildRegexpFunction(
    DAGExpressionAnalyzer * analyzer,
    const tipb::Expr & expr,
    ExpressionActionsPtr & actions)
{
    const String & func_name = getFunctionName(expr);

    /// Build the actions of every argument first, keeping the resulting names in argument order.
    Names argument_names;
    for (const auto & child : expr.children())
        argument_names.push_back(analyzer->getActions(child, actions));

    std::shared_ptr<TiDB::ITiDBCollator> collator = getCollatorFromExpr(expr);

    /// According to https://github.com/pingcap/tidb/blob/v5.0.0/expression/builtin_like.go#L126,
    /// TiDB uses RegexpXXXSig for binary collation and RegexpXXXUTF8Sig otherwise.
    /// The binary collator has to be set explicitly here because `getCollatorFromExpr`
    /// returns nullptr if new collation is not enabled.
    const auto sig = expr.sig();
    const bool is_binary_sig = sig == tipb::ScalarFuncSig::RegexpReplaceSig || sig == tipb::ScalarFuncSig::RegexpSig;
    if (is_binary_sig)
        collator = TiDB::ITiDBCollator::getCollator(TiDB::ITiDBCollator::BINARY);

    return analyzer->applyFunction(func_name, argument_names, actions, collator);
}

DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::function_builder_map(
{{"in", DAGExpressionAnalyzerHelper::buildInFunction},
{"notIn", DAGExpressionAnalyzerHelper::buildInFunction},
Expand All @@ -399,6 +423,8 @@ DAGExpressionAnalyzerHelper::FunctionBuilderMap DAGExpressionAnalyzerHelper::fun
{"leftUTF8", DAGExpressionAnalyzerHelper::buildLeftUTF8Function},
{"date_add", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction<DateAdd>},
{"date_sub", DAGExpressionAnalyzerHelper::buildDateAddOrSubFunction<DateSub>},
{"regexp", DAGExpressionAnalyzerHelper::buildRegexpFunction},
{"replaceRegexpAll", DAGExpressionAnalyzerHelper::buildRegexpFunction},
{"tidbRound", DAGExpressionAnalyzerHelper::buildRoundFunction}});

} // namespace DB
} // namespace DB
7 changes: 6 additions & 1 deletion dbms/src/Flash/Coprocessor/DAGExpressionAnalyzerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ class DAGExpressionAnalyzerHelper
const tipb::Expr & expr,
ExpressionActionsPtr & actions);

/// Builder for regexp-family functions; it is registered in `function_builder_map`
/// under the names "regexp" and "replaceRegexpAll".
/// Builds the actions for every child of `expr`, then applies the function with the
/// collator taken from `expr`; for `RegexpSig` / `RegexpReplaceSig` the binary collator
/// is used instead. Returns the name produced by `analyzer->applyFunction`.
static String buildRegexpFunction(
DAGExpressionAnalyzer * analyzer,
const tipb::Expr & expr,
ExpressionActionsPtr & actions);

static String genFuncString(
const String & func_name,
const Names & argument_names,
Expand All @@ -74,4 +79,4 @@ class DAGExpressionAnalyzerHelper

static FunctionBuilderMap function_builder_map;
};
} // namespace DB
} // namespace DB
4 changes: 2 additions & 2 deletions dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,8 @@ const std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
//{tipb::ScalarFuncSig::UUID, "cast"},

{tipb::ScalarFuncSig::LikeSig, "like3Args"},
//{tipb::ScalarFuncSig::RegexpSig, "cast"},
//{tipb::ScalarFuncSig::RegexpUTF8Sig, "cast"},
{tipb::ScalarFuncSig::RegexpSig, "regexp"},
{tipb::ScalarFuncSig::RegexpUTF8Sig, "regexp"},

//{tipb::ScalarFuncSig::JsonExtractSig, "cast"},
//{tipb::ScalarFuncSig::JsonUnquoteSig, "cast"},
Expand Down
3 changes: 2 additions & 1 deletion dbms/src/Functions/FunctionsStringArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,8 @@ class ExtractAllImpl
+ " of first argument of function " + getName() + ". Must be constant string.",
ErrorCodes::ILLEGAL_COLUMN);

re = Regexps::get<false, false>(col->getValue<String>());
int flags = OptimizedRegularExpression::RE_DOT_NL;
re = Regexps::get<false, false>(col->getValue<String>(), flags);
capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;

matches.resize(capture + 1);
Expand Down
Loading

0 comments on commit 1ac1a86

Please sign in to comment.