From 287e8be2a4d0ede8b4d54297e4052ec47ff477be Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 00:41:32 +0100 Subject: [PATCH 1/8] Initial support for capture in regex Allow for a capturing regular expression to be passed with a filter function that will filter based on the captured output. Also allows a specific captured term to be returned as the returned term (rather than the whole index term) --- src/leveled_bookie.erl | 2 +- src/leveled_codec.erl | 45 ++++++- src/leveled_runner.erl | 13 +- src/leveled_util.erl | 34 +++-- test/end_to_end/iterator_SUITE.erl | 206 ++++++++++++++++++++++++++++- test/end_to_end/perf_SUITE.erl | 1 + 6 files changed, 275 insertions(+), 26 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 34a2435f..ade345ca 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -696,7 +696,7 @@ book_returnfolder(Pid, RunnerType) -> IndexVal::term(), Start::IndexVal, End::IndexVal, - ReturnTerms::boolean(), + ReturnTerms::boolean()|binary(), TermRegex :: leveled_codec:regular_expression(). book_indexfold(Pid, Constraint, FoldAccT, Range, TermHandling) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index fd1ef627..1f655fde 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -112,10 +112,13 @@ {index_specs(), infinity|integer()}. % {KeyChanges, TTL} -type maybe_lookup() :: lookup|no_lookup. +-type actual_regex() :: + reference()|{re_pattern, term(), term(), term(), term()}. +-type capture_filter_fun() :: fun((list({binary(), binary()})) -> boolean()). +-type capture_reference() :: + {capture, actual_regex(), list(binary()), capture_filter_fun()}. -type regular_expression() :: - {re_pattern, term(), term(), term(), term()}|undefined. - % first element must be re_pattern, but tuple may change legnth with - % versions + actual_regex()|undefined|capture_reference(). -type value_fetcher() :: {fun((pid(), leveled_codec:journal_key()) -> any()), @@ -155,6 +158,7 @@ last_moddate/0, lastmod_range/0, regular_expression/0, + actual_regex/0, value_fetcher/0, proxy_object/0, slimmed_key/0 @@ -292,8 +296,9 @@ maybe_accumulate( maybe_accumulate(T, Acc, Count, Filter, AccFun). -spec accumulate_index( - {boolean(), undefined|leveled_runner:mp()}, leveled_runner:acc_fun()) - -> any(). + {boolean()|binary(), + regular_expression()}, + leveled_runner:acc_fun()) -> any(). 
accumulate_index({false, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, _IndexInfo, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, ObjKey, Acc) @@ -302,9 +307,36 @@ accumulate_index({true, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) end; +accumulate_index( + {AddTerm, {capture, TermRegex, CptKeys, FilterFun}}, FoldKeysFun) -> + ExpectedKeys = length(CptKeys), + Opts = [{capture, all_but_first, binary}], + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + case leveled_util:regex_run(IdxValue, TermRegex, Opts) of + {match, CptTerms} when length(CptTerms) == ExpectedKeys -> + CptPairs = lists:zip(CptKeys, CptTerms), + case FilterFun(CptPairs) of + true -> + case AddTerm of + true -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(Bucket, ObjKey, Acc); + CptKey when is_binary(CptKey) -> + {CptKey, CptValue} = + lists:keyfind(CptKey, 1, CptPairs), + FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) + end; + false -> + Acc + end; + _ -> + Acc + end + end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - case leveled_util:regex_run(IdxValue, TermRegex) of + case leveled_util:regex_run(IdxValue, TermRegex, []) of nomatch -> Acc; _ -> @@ -317,6 +349,7 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end end. + -spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc %% When comparing two keys in the ledger need to find if one key comes before diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 4b221b09..15704b20 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -130,12 +130,11 @@ bucket_list(SnapFun, Tag, FoldBucketsFun, InitAcc, MaxBuckets) -> end, {async, Runner}. --spec index_query(snap_fun(), - {leveled_codec:ledger_key(), - leveled_codec:ledger_key(), - {boolean(), undefined|mp()|iodata()}}, - {fold_keys_fun(), foldacc()}) - -> {async, runner_fun()}. +-spec index_query( + snap_fun(), + {leveled_codec:ledger_key(), leveled_codec:ledger_key(), + {boolean()|binary(), leveled_codec:regular_expression()}}, + {fold_keys_fun(), foldacc()}) -> {async, runner_fun()}. %% @doc %% Secondary index query %% This has the special capability that it will expect a message to be thrown @@ -681,7 +680,7 @@ accumulate_keys(FoldKeysFun, undefined) -> accumulate_keys(FoldKeysFun, TermRegex) -> fun(Key, _Value, Acc) -> {B, K} = leveled_codec:from_ledgerkey(Key), - case leveled_util:regex_run(K, TermRegex) of + case leveled_util:regex_run(K, TermRegex, []) of nomatch -> Acc; _ -> diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 0817b32b..9f82fdac 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -11,7 +11,7 @@ magic_hash/1, t2b/1, safe_rename/4, - regex_run/2, + regex_run/3, regex_compile/1 ]). @@ -42,15 +42,31 @@ integer_time(TS) -> DT = calendar:now_to_universal_time(TS), calendar:datetime_to_gregorian_seconds(DT). + +-type match_option() :: + 'caseless' | + {'offset', non_neg_integer()} | + {'capture', value_spec()} | + {'capture', value_spec(), value_spec_type()}. +-type value_spec() :: + 'all' | 'all_but_first' | 'first' | 'none' | [value_id()]. +-type value_spec_type() :: 'binary'. +-type value_id() :: binary(). +-type match_index() :: {non_neg_integer(), non_neg_integer()}. + -spec regex_run( - iodata(), any()) -> - match | nomatch | {match, list()} | {error, atom()}. 
-regex_run(Subject, CompiledRE2) when is_reference(CompiledRE2) ->
-    re2:run(Subject, CompiledRE2);
-regex_run(Subject, CompiledPCRE) ->
-    re:run(Subject, CompiledPCRE).
-
--spec regex_compile(iodata()) -> reference().
+        iodata(), leveled_codec:actual_regex(), list(match_option())) ->
+            match |
+            nomatch |
+            {match, list(match_index())} |
+            {match, list(binary())} |
+            {error, atom()}.
+regex_run(Subject, CompiledRE2, Opts) when is_reference(CompiledRE2) ->
+    re2:run(Subject, CompiledRE2, Opts);
+regex_run(Subject, CompiledPCRE, Opts) ->
+    re:run(Subject, CompiledPCRE, Opts).
+
+-spec regex_compile(iodata()) -> {ok, reference()}.
 regex_compile(PlainRegex) ->
     re2:compile(PlainRegex).

diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl
index a72f36e9..899601ca 100644
--- a/test/end_to_end/iterator_SUITE.erl
+++ b/test/end_to_end/iterator_SUITE.erl
@@ -13,7 +13,9 @@
         query_count/1,
         multibucket_fold/1,
         foldobjects_bybucket_range/1,
-        rotating_objects/1]).
+        rotating_objects/1,
+        capture_and_filter_terms/1
+    ]).

 all() -> [
             expiring_indexes,
             breaking_folds,
             single_object_with2i,
             small_load_with2i,
             query_count,
             multibucket_fold,
             rotating_objects,
-            foldobjects_bybucket_range
+            foldobjects_bybucket_range,
+            capture_and_filter_terms
             ].

@@ -677,7 +680,7 @@ query_count(_Config) ->
     Mia2000Count2 =
         lists:foldl(
             fun({Term, _Key}, Acc) ->
-                case leveled_util:regex_run(Term, RegMia) of
+                case leveled_util:regex_run(Term, RegMia, []) of
                     nomatch ->
                         Acc;
                     _ ->
@@ -840,6 +843,203 @@ query_count(_Config) ->
     testutil:reset_filestructure().

+capture_and_filter_terms(_Config) ->
+    RootPath = testutil:reset_filestructure(),
+    Bucket = {<<"Type1">>, <<"Bucket1">>},
+    IdxName = <<"people_bin">>,
+    {ok, Book1} =
+        leveled_bookie:book_start(
+            RootPath, 2000, 50000000, testutil:sync_strategy()),
+    ObjectGen = testutil:get_compressiblevalue_andinteger(),
+    IndexGen =
+        fun() ->
+            [{add, IdxName, list_to_binary(perf_SUITE:random_people_index())}]
+        end,
+    ObjL1 =
+        testutil:generate_objects(
+            80000, uuid, [], ObjectGen, IndexGen, Bucket),
+    testutil:riakload(Book1, ObjL1),
+
+    StartDoB = <<"19740301">>,
+    EndDoB = <<"19761031">>,
+
+    WillowLeedsFinder =
+        "[^\\|]*\\|[0-9]{8}\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+
+    SW0 = os:timestamp(),
+    {ok, WillowLeedsPCRE} = re:compile(WillowLeedsFinder),
+    QueryPCRE0 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {true, WillowLeedsPCRE}},
+    {async, Runner0} = leveled_bookie:book_returnfolder(Book1, QueryPCRE0),
+    Results0 = Runner0(),
+    BornMid70s0 =
+        lists:filtermap(
+            fun({IdxValue, Key}) ->
+                DoB =
+                    list_to_binary(
+                        lists:nth(
+                            2,
+                            string:tokens(binary_to_list(IdxValue), "|")
+                        )),
+                case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
+                    true ->
+                        {true, Key};
+                    false ->
+                        false
+                end
+            end,
+            Results0
+        ),
+
+    SW1 = os:timestamp(),
+
+    WillowLeedsExtractor =
+        "[^\\|]*\\|(?P<dob>[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+    FilterFun1 =
+        fun([{<<"dob">>, DoB}]) ->
+            (DoB >= StartDoB) andalso (DoB =< EndDoB)
+        end,
+    {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
+    QueryPCRE1 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
+    BornMid70sPCRE1 = RunnerPCRE1(),
+
+    SW2 = os:timestamp(),
+
+    {ok, WillowLeedsExtractorRE2} =
+        re2:compile(WillowLeedsExtractor),
+    QueryRE2_2 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
+    BornMid70sRE2_2 = RunnerRE2_2(),
+
+    SW3 = os:timestamp(),
+
+    QueryRE2_3 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {<<"dob">>,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], fun(_) -> true end}}
+        },
+    {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
+    Results3 = RunnerRE2_3(),
+    BornMid70sRE2_3 =
+        lists:filtermap(
+            fun({DoB, Key}) ->
+                case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
+                    true ->
+                        {true, Key};
+                    false ->
+                        false
+                end
+            end,
+            Results3
+        ),
+
+    SW4 = os:timestamp(),
+
+    WillowLeedsDoubleExtractor =
+        "[^\\|]*\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
+        "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*",
+
+    FilterFun2 =
+        fun(Captures) ->
+            {<<"dob">>, DoB} = hd(Captures),
+            (DoB >= StartDoB) andalso (DoB =< EndDoB)
+        end,
+    QueryRE2_4 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
+                    [<<"dob">>, <<"dod">>],
+                    FilterFun2}}
+        },
+    {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
+    BornMid70sRE2_4 = RunnerRE2_4(),
+
+    SW5 = os:timestamp(),
+
+    QueryRE2_5 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {true,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
+    BornMid70sRE2_5 =
+        lists:filtermap(
+            fun({T, K}) ->
+                {match, _} =
+                    leveled_util:regex_run(T, WillowLeedsExtractorRE2, []),
+                {true, K}
+            end,
+            RunnerRE2_5()),
+
+    true = length(BornMid70s0) > 0,
+    true = length(BornMid70sPCRE1) > 0,
+    true = length(BornMid70sRE2_2) > 0,
+    true = length(BornMid70sRE2_3) > 0,
+    true = length(BornMid70sRE2_4) > 0,
+    true = length(BornMid70sRE2_5) > 0,
+
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sPCRE1),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_2),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5),
+
+    io:format(
+        % user,
+        "~nFilter outside took ~w ms~n",
+        [timer:now_diff(SW1, SW0) div 1000]),
+    io:format(
+        % user,
+        "~nPCRE Capture filter inside took ~w ms~n",
+        [timer:now_diff(SW2, SW1) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 Capture filter inside took ~w ms~n",
+        [timer:now_diff(SW3, SW2) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 Capture filter outside took ~w ms~n",
+        [timer:now_diff(SW4, SW3) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 double-capture filter outside took ~w ms~n",
+        [timer:now_diff(SW5, SW4) div 1000]),
+
+    ok = leveled_bookie:book_close(Book1),
+
+    testutil:reset_filestructure().
+
+
 count_termsonindex(Bucket, IdxField, Book, QType) ->
     lists:foldl(
         fun(X, Acc) ->

diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl
index 93f78f10..27549f54 100644
--- a/test/end_to_end/perf_SUITE.erl
+++ b/test/end_to_end/perf_SUITE.erl
@@ -5,6 +5,7 @@
 -export([
     riak_ctperf/1, riak_fullperf/1, riak_profileperf/1, riak_miniperf/1
 ]).
+-export([random_people_index/0]).
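
In summary, the capture query introduced by this first patch takes the following shape — a minimal sketch, assuming a started bookie `Bookie` and a bucket `Bucket` (illustrative names), with index terms in the pipe-delimited format used by the test above. At this point in the series the filter fun receives the captured terms as a list of {Key, Value} binary pairs, per the capture_filter_fun() type:

    %% Illustrative only: filter on the captured date-of-birth,
    %% returning only the matching object keys
    {ok, DoBExtractor} =
        leveled_util:regex_compile("[^\\|]*\\|(?P<dob>[0-9]{8})\\|.*"),
    DoBFilter =
        fun([{<<"dob">>, DoB}]) ->
            DoB >= <<"19740301">> andalso DoB =< <<"19761031">>
        end,
    Query =
        {index_query,
            {Bucket, null},
            {fun testutil:foldkeysfun/3, []},
            {<<"people_bin">>, <<"M">>, <<"Z">>},
            {false, {capture, DoBExtractor, [<<"dob">>], DoBFilter}}},
    {async, Runner} = leveled_bookie:book_returnfolder(Bookie, Query),
    MatchingKeys = Runner().

Per the accumulate_index/2 clauses above, passing `true` as the first element of the term-handling tuple returns {IndexTerm, ObjKey} pairs, while passing a capture key such as <<"dob">> returns {CapturedValue, ObjKey} pairs instead.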
-define(PEOPLE_INDEX, <<"people_bin">>). -define(MINI_QUERY_DIVISOR, 8). From f54d75c72c51ac2f0aba1ca063d176c22f1f2f24 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 13:37:19 +0100 Subject: [PATCH 2/8] Create leveled_filter.erl First draft of allowing an externally provided comparison expression to be validated, parsed and evaluated --- src/leveled_filter.erl | 174 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 src/leveled_filter.erl diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl new file mode 100644 index 00000000..3d9d04b2 --- /dev/null +++ b/src/leveled_filter.erl @@ -0,0 +1,174 @@ +%% -------- Filter Functions --------- +%% +%% Support for different filter expressions within leveled +%% + +-module(leveled_filter). + +-export([validate_comparison_expression/2]). + +%%%============================================================================ +%%% External API +%%%============================================================================ + +%% @doc validate a comaprison expression +%% A comparison expression allows for string comparisons to be made using +%% >, <, >=, =< with andalso, or as well as () to group different expressions +%% The output is a function which will be passed a Map where the Map may be the +%% result of reading some projected attributes on an index string. +%% To compare a string with a key in the map, the name of the key must begin +%% with a "$". +%% In creating the function, the potential keys must be known and passed as a +%% string in the list (without the leading $). +-spec validate_comparison_expression( + string(), list(string())) -> + fun((map()) -> {value, boolean(), map()}) | + {error, string(), erl_anno:location()}. +validate_comparison_expression( + Expression, AvailableArgs) when is_list(Expression) -> + case erl_scan:string(Expression) of + {ok, Tokens, _EndLoc} -> + case vert_compexpr_tokens(Tokens, AvailableArgs, []) of + {ok, UpdTokens} -> + case erl_parse:parse_exprs(UpdTokens) of + {ok, [Form]} -> + io:format("Form ~p~n", [Form]), + fun(LookupMap) -> + erl_eval:expr(Form, LookupMap) + end; + {error, ErrorInfo} -> + {error, ErrorInfo, undefined} + end; + {error, Error, ErrorLocation} -> + {error, lists:flatten(Error), ErrorLocation} + end; + {error, Error, ErrorLocation} -> + {error, Error, ErrorLocation} + end; +validate_comparison_expression(_Expression, _AvailableArgs) -> + {error, "Expression not a valid string", undefined}. 
+ +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +vert_compexpr_tokens([], _Args, ParsedTokens) -> + {ok, lists:reverse(ParsedTokens)}; +vert_compexpr_tokens([{dot, LN}], Args, ParsedTokens) -> + vert_compexpr_tokens([], Args, [{dot, LN}|ParsedTokens]); +vert_compexpr_tokens(L, _Args, _ParsedTokens) when length(L) == 1 -> + {error, "Query does not end with `.`", undefined}; +vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> + case hd(String) of + 36 -> % 36 == $ + Arg = tl(String), + case lists:member(Arg, Args) of + true -> + VarToken = {var, LN, list_to_atom(Arg)}, + vert_compexpr_tokens(Rest, Args, [VarToken|ParsedTokens]); + false -> + {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} + end; + _ -> % Not a key, treat as a string + vert_compexpr_tokens( + Rest, Args, [{string, LN, String}|ParsedTokens]) + end; +vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); +vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); +vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); +vert_compexpr_tokens([{'or', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'or', LN}|ParsedTokens]); +vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); +vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'<', LN}|ParsedTokens]); +vert_compexpr_tokens([{'>=', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'>=', LN}|ParsedTokens]); +vert_compexpr_tokens([{'=<', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'=<', LN}|ParsedTokens]); +vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> + {error, io_lib:format("Invalid token ~w", [InvalidToken]), undefined}. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +good_querystring_test() -> + QueryString = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", + AvailableArgs = ["dob", "dod"], + F = validate_comparison_expression(QueryString, AvailableArgs), + M1 = maps:from_list([{dob, "19750202"}, {dod, "20221216"}]), + M2 = maps:from_list([{dob, "19750202"}, {dod, "20191216"}]), + M3 = maps:from_list([{dob, "19790202"}, {dod, "20221216"}]), + M4 = maps:from_list([{dob, "19790202"}, {dod, "20191216"}]), + M5 = maps:from_list([{dob, "19790202"}, {dod, "20241216"}]), + ?assertMatch(true, element(2, F(M1))), + ?assertMatch(true, element(2, F(M2))), + ?assertMatch(true, element(2, F(M3))), + ?assertMatch(false, element(2, F(M4))), + ?assertMatch(false, element(2, F(M5))). 
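
The `$` convention is implemented purely as a token rewrite ahead of parsing; for example, scanning a fragment of the expression used in the test above yields string tokens (a sketch, with locations as returned by erl_scan):

    1> erl_scan:string("\"$dob\" >= \"19740301\".").
    {ok,[{string,1,"$dob"},{'>=',1},{string,1,"19740301"},{dot,1}],1}

vert_compexpr_tokens/3 then swaps {string,1,"$dob"} for {var,1,dob}, provided "dob" appears in the available-args list, leaving a form that erl_parse and erl_eval can treat as an ordinary variable comparison.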
+ +bad_querystring_test() -> + QueryString = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString1 = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\")", + BadString2 = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\")).", + BadString3 = + "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString4 = "os:cmd(foo)", + BadString5 = + [42,63,52,10,240,159,140,190] ++ + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString6 = [(16#110000 + 1)|QueryString], + BadString7 = <<42:32/integer>>, + ?assertMatch( + {error, "Unavailable Arg dod", 1}, + validate_comparison_expression(QueryString, ["dob", "pv"]) + ), + ?assertMatch( + {error, "Query does not end with `.`", undefined}, + validate_comparison_expression(BadString1, ["dob", "dod"]) + ), + ?assertMatch( + {error, {1 ,erl_parse, ["syntax error before: ","')'"]}, undefined}, + validate_comparison_expression(BadString2, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {'and',1}", undefined}, + validate_comparison_expression(BadString3, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {atom,1,os}", undefined}, + validate_comparison_expression(BadString4, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {'*',1}", undefined}, + validate_comparison_expression(BadString5, ["dob", "dod"]) + ), + ?assertMatch( + {error, {1, erl_scan, {illegal,character}}, 1}, + validate_comparison_expression(BadString6, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Expression not a valid string", undefined}, + validate_comparison_expression(BadString7, ["dob", "dod"]) + ). + +-endif. From 4a4a7ca9542a2717105d720ff9c6d13031b782fa Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 17:44:45 +0100 Subject: [PATCH 3/8] Make binary comparisons in leveled_filter Required as re2 can only output binary not string in captures. --- src/leveled_codec.erl | 13 ++- src/leveled_filter.erl | 51 ++++++------ test/end_to_end/iterator_SUITE.erl | 123 +++++++++++++++++++++++------ 3 files changed, 132 insertions(+), 55 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 1f655fde..7426fb23 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -114,9 +114,9 @@ lookup|no_lookup. -type actual_regex() :: reference()|{re_pattern, term(), term(), term(), term()}. --type capture_filter_fun() :: fun((list({binary(), binary()})) -> boolean()). +-type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). -type capture_reference() :: - {capture, actual_regex(), list(binary()), capture_filter_fun()}. + {capture, actual_regex(), list(atom()), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). 
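
With the revised capture_filter_fun() type, a filter fun now matches on a map of atom-keyed captures rather than the ordered list of pairs used in the first patch; a sketch of the two shapes (capture values are binaries, as regex_run/3 is called with the binary capture option):

    %% Patch 1 shape: ordered list of {Key, Value} pairs
    FilterFunV1 = fun([{<<"dob">>, DoB}]) -> DoB >= <<"19740301">> end,
    %% Shape after this patch: map with atom keys
    FilterFunV2 = fun(#{dob := DoB}) -> DoB >= <<"19740301">> end,
    true = FilterFunV1([{<<"dob">>, <<"19750601">>}]),
    true = FilterFunV2(#{dob => <<"19750601">>}).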
@@ -314,17 +314,16 @@ accumulate_index( fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, Opts) of {match, CptTerms} when length(CptTerms) == ExpectedKeys -> - CptPairs = lists:zip(CptKeys, CptTerms), - case FilterFun(CptPairs) of + CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + case FilterFun(CptMap) of true -> case AddTerm of true -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); false -> FoldKeysFun(Bucket, ObjKey, Acc); - CptKey when is_binary(CptKey) -> - {CptKey, CptValue} = - lists:keyfind(CptKey, 1, CptPairs), + CptKey when is_atom(CptKey) -> + CptValue = maps:get(CptKey, CptMap), FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) end; false -> diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 3d9d04b2..a273b128 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -11,9 +11,10 @@ %%% External API %%%============================================================================ -%% @doc validate a comaprison expression +%% @doc validate a comparison expression %% A comparison expression allows for string comparisons to be made using -%% >, <, >=, =< with andalso, or as well as () to group different expressions +%% >, <, >=, =< with andalso, orelse as well as () to group different +%% expressions. %% The output is a function which will be passed a Map where the Map may be the %% result of reading some projected attributes on an index string. %% To compare a string with a key in the map, the name of the key must begin @@ -32,9 +33,8 @@ validate_comparison_expression( {ok, UpdTokens} -> case erl_parse:parse_exprs(UpdTokens) of {ok, [Form]} -> - io:format("Form ~p~n", [Form]), fun(LookupMap) -> - erl_eval:expr(Form, LookupMap) + element(2, erl_eval:expr(Form, LookupMap)) end; {error, ErrorInfo} -> {error, ErrorInfo, undefined} @@ -69,9 +69,10 @@ vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> false -> {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} end; - _ -> % Not a key, treat as a string + _ -> % Not a key, treat as a binary + BinString = [{'>>', LN}, {string, LN, String}, {'<<', LN}], vert_compexpr_tokens( - Rest, Args, [{string, LN, String}|ParsedTokens]) + Rest, Args, BinString ++ ParsedTokens) end; vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); @@ -79,8 +80,8 @@ vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); -vert_compexpr_tokens([{'or', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'or', LN}|ParsedTokens]); +vert_compexpr_tokens([{'orelse', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'orelse', LN}|ParsedTokens]); vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> @@ -104,38 +105,38 @@ vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> good_querystring_test() -> QueryString = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", + " orelse (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", AvailableArgs = ["dob", "dod"], F = validate_comparison_expression(QueryString, AvailableArgs), - M1 = 
maps:from_list([{dob, "19750202"}, {dod, "20221216"}]), - M2 = maps:from_list([{dob, "19750202"}, {dod, "20191216"}]), - M3 = maps:from_list([{dob, "19790202"}, {dod, "20221216"}]), - M4 = maps:from_list([{dob, "19790202"}, {dod, "20191216"}]), - M5 = maps:from_list([{dob, "19790202"}, {dod, "20241216"}]), - ?assertMatch(true, element(2, F(M1))), - ?assertMatch(true, element(2, F(M2))), - ?assertMatch(true, element(2, F(M3))), - ?assertMatch(false, element(2, F(M4))), - ?assertMatch(false, element(2, F(M5))). + M1 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20221216">>}]), + M2 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20191216">>}]), + M3 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20221216">>}]), + M4 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20191216">>}]), + M5 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20241216">>}]), + ?assertMatch(true, F(M1)), + ?assertMatch(true, F(M2)), + ?assertMatch(true, F(M3)), + ?assertMatch(false, F(M4)), + ?assertMatch(false, F(M5)). bad_querystring_test() -> QueryString = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + " orelse (\"$dod\" > \"20200101\").", BadString1 = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\")", + " orelse (\"$dod\" > \"20200101\")", BadString2 = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\")).", + " orelse (\"$dod\" > \"20200101\")).", BadString3 = "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + " orelse (\"$dod\" > \"20200101\").", BadString4 = "os:cmd(foo)", BadString5 = [42,63,52,10,240,159,140,190] ++ - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< <<\"19761030\")" + " orelse (\"$dod\" > \"20200101\").", BadString6 = [(16#110000 + 1)|QueryString], BadString7 = <<42:32/integer>>, ?assertMatch( diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 899601ca..0cf0ebb5 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -1,6 +1,5 @@ -module(iterator_SUITE). --include_lib("common_test/include/ct.hrl"). -include("include/leveled.hrl"). -define(KEY_ONLY, {false, undefined}). @@ -18,14 +17,14 @@ ]). all() -> [ - expiring_indexes, - breaking_folds, - single_object_with2i, - small_load_with2i, - query_count, - multibucket_fold, - rotating_objects, - foldobjects_bybucket_range, + % expiring_indexes, + % breaking_folds, + % single_object_with2i, + % small_load_with2i, + % query_count, + % multibucket_fold, + % rotating_objects, + % foldobjects_bybucket_range, capture_and_filter_terms ]. 
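
One subtlety in the tokeniser change above: ParsedTokens is accumulated in reverse, so prepending [{'>>', LN}, {string, LN, String}, {'<<', LN}] produces the tokens of a binary literal in the correct order once the list is finally reversed. A literal such as "19740301" therefore emerges as the token sequence

    {'<<', LN}, {string, LN, "19740301"}, {'>>', LN}

and the expression is evaluated as `Dob >= <<"19740301">>`, comparing like-for-like with the binary capture values that re2 produces — the motivation given in this patch's commit message.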
@@ -885,7 +884,8 @@ capture_and_filter_terms(_Config) ->
                         lists:nth(
                             2,
                             string:tokens(binary_to_list(IdxValue), "|")
-                        )),
+                        )
+                    ),
                 case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
                     true ->
                         {true, Key};
@@ -902,7 +902,8 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*\\|(?P<dob>[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
         "[^\\|]*#LS[^\\|]*",
     FilterFun1 =
-        fun([{<<"dob">>, DoB}]) ->
+        fun(Captures) ->
+            DoB = maps:get(dob, Captures),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
@@ -912,7 +913,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {false,
-                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorPCRE, [dob], FilterFun1}}
         },
     {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
     BornMid70sPCRE1 = RunnerPCRE1(),
@@ -926,7 +927,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {false,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}}
         },
     {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
     BornMid70sRE2_2 = RunnerRE2_2(),
@@ -938,8 +939,8 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"dob">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], fun(_) -> true end}}
+            {dob,
+                {capture, WillowLeedsExtractorRE2, [dob], fun(_) -> true end}}
         },
     {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
     Results3 = RunnerRE2_3(),
@@ -964,7 +965,7 @@ capture_and_filter_terms(_Config) ->
     FilterFun2 =
         fun(Captures) ->
-            {<<"dob">>, DoB} = hd(Captures),
+            DoB = maps:get(dob, Captures),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     QueryRE2_4 =
@@ -975,7 +976,7 @@ capture_and_filter_terms(_Config) ->
             {false,
                 {capture,
                     element(2, re2:compile(WillowLeedsDoubleExtractor)),
-                    [<<"dob">>, <<"dod">>],
+                    [dob, dod],
                     FilterFun2}}
         },
     {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
@@ -989,7 +990,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {true,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}}
         },
     {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
     BornMid70sRE2_5 =
@@ -1001,18 +1002,82 @@ capture_and_filter_terms(_Config) ->
             end,
             RunnerRE2_5()),

+    SW6 = os:timestamp(),
+
+    ComparatorExpression3 =
+        "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\") andalso (\"$dod\" > \"19000101\" orelse \"$dod\" < \"20501231\").",
+
+    FilterFun3 =
+        leveled_filter:validate_comparison_expression(
+            ComparatorExpression3, ["dob", "dod"]),
+    QueryRE2_6 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
+                    [dob, dod],
+                    FilterFun3}}
+        },
+    {async, RunnerRE2_6} = leveled_bookie:book_returnfolder(Book1, QueryRE2_6),
+    BornMid70sRE2_6 = RunnerRE2_6(),
+
+    SW7 = os:timestamp(),
+
+    ComparatorExpression4 =
+        "\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\".",
+
+    FilterFun4 =
+        leveled_filter:validate_comparison_expression(
+            ComparatorExpression4, ["dob"]),
+    QueryRE2_7 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsExtractor)),
+                    [dob],
+                    FilterFun4}}
+        },
+    {async, RunnerRE2_7} = leveled_bookie:book_returnfolder(Book1, QueryRE2_7),
+    BornMid70sRE2_7 = RunnerRE2_7(),
+
+    SW8 = os:timestamp(),
+
+    PreFilterRE =
+        "[^\\|]*\\|(?P<dob>197[4-6]{1}[0-9]{4})\\|"
+        "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+    QueryRE2_8 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(PreFilterRE)),
+                    [dob],
+                    FilterFun4}}
+        },
+    {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8),
+    BornMid70sRE2_8 = RunnerRE2_8(),
+
+    SW9 = os:timestamp(),
+
     true = length(BornMid70s0) > 0,
-    true = length(BornMid70sPCRE1) > 0,
-    true = length(BornMid70sRE2_2) > 0,
-    true = length(BornMid70sRE2_3) > 0,
-    true = length(BornMid70sRE2_4) > 0,
-    true = length(BornMid70sRE2_5) > 0,

     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sPCRE1),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_2),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_6),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_7),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_8),

     io:format(
         % user,
@@ -1034,6 +1099,18 @@ capture_and_filter_terms(_Config) ->
         % user,
         "~nRE2 double-capture filter outside took ~w ms~n",
         [timer:now_diff(SW5, SW4) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 double-capture filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW7, SW6) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 single-capture filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW8, SW7) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW9, SW8) div 1000]),

     ok = leveled_bookie:book_close(Book1),

From 2b61a1372918085ef12c573959e256e6899752bb Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Sat, 27 Apr 2024 22:15:17 +0100
Subject: [PATCH 4/8] Add Filter Expression support

Allow for capture with either delimited index terms, or with regex capture,
and then filtering of captured attributes using a logical expression.

The xref check `locals_not_used` is now ignored due to issues with
auto-generated modules.
---
 .gitignore                         |   2 +
 rebar.config                       |   5 +-
 src/leveled_codec.erl              |  67 +++--
 src/leveled_filter.erl             | 436 ++++++++++++++++++++---------
 src/leveled_filterlexer.xrl        |  49 ++++
 src/leveled_filterparser.yrl       |  50 ++++
 test/end_to_end/iterator_SUITE.erl | 413 +++++++++++++--------------
 test/end_to_end/perf_SUITE.erl     |  61 +++-
 8 files changed, 703 insertions(+), 380 deletions(-)
 create mode 100644 src/leveled_filterlexer.xrl
 create mode 100644 src/leveled_filterparser.yrl

diff --git a/.gitignore b/.gitignore
index 3906bbcd..2dc6c0c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ cover_*
 .eqc-info
 leveled_data/*
 compile_commands.json
+*parser.erl
+*lexer.erl

diff --git a/rebar.config b/rebar.config
index e73ff4b7..f2b6ff10 100644
--- a/rebar.config
+++ b/rebar.config
@@ -4,11 +4,11 @@
 {xref_checks,
     [undefined_function_calls,undefined_functions,
-        locals_not_used,
        deprecated_function_calls, deprecated_functions]}.
{cover_excl_mods, - [testutil, + [leveled_filterlexer, leveled_filterparser, + testutil, appdefined_SUITE, basic_SUITE, iterator_SUITE, perf_SUITE, recovery_SUITE, riak_SUITE, tictac_SUITE]}. @@ -23,6 +23,7 @@ ]}, {test, [{extra_src_dirs, ["test/end_to_end", "test/property"]} ]}, + {perf_fexp, [{erl_opts, [{d, test_filter_expression}]}]}, {perf_full, [{erl_opts, [{d, performance, riak_fullperf}]}]}, {perf_mini, [{erl_opts, [{d, performance, riak_miniperf}]}]}, {perf_prof, [{erl_opts, [{d, performance, riak_profileperf}]}]} diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 7426fb23..9270668e 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -116,7 +116,8 @@ reference()|{re_pattern, term(), term(), term(), term()}. -type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). -type capture_reference() :: - {capture, actual_regex(), list(atom()), capture_filter_fun()}. + {capture, actual_regex(), list(atom()|binary()), capture_filter_fun()}| + {delimited, string(), list(binary()), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -313,26 +314,39 @@ accumulate_index( Opts = [{capture, all_but_first, binary}], fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, Opts) of - {match, CptTerms} when length(CptTerms) == ExpectedKeys -> - CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), - case FilterFun(CptMap) of - true -> - case AddTerm of - true -> - FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); - false -> - FoldKeysFun(Bucket, ObjKey, Acc); - CptKey when is_atom(CptKey) -> - CptValue = maps:get(CptKey, CptMap), - FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) - end; - false -> - Acc - end; + {match, CptTerms} -> + L = min(length(CptTerms), ExpectedKeys), + check_captured_terms( + lists:sublist(CptTerms, L), + lists:sublist(CptKeys, L), + FilterFun, + AddTerm, + FoldKeysFun, + Bucket, + IdxValue, + ObjKey, + Acc); _ -> Acc end end; +accumulate_index( + {AddTerm, {delimited, Delim, CptKeys, FilterFun}}, FoldKeysFun) -> + ExpectedKeys = length(CptKeys), + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + CptTerms = string:split(IdxValue, Delim, all), + L = min(length(CptTerms), ExpectedKeys), + check_captured_terms( + lists:sublist(CptTerms, L), + lists:sublist(CptKeys, L), + FilterFun, + AddTerm, + FoldKeysFun, + Bucket, + IdxValue, + ObjKey, + Acc) + end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, []) of @@ -348,6 +362,25 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end end. +check_captured_terms( + CptTerms, CptKeys, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> + CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + case FilterFun(CptMap) of + true -> + case AddTerm of + true -> + FoldKeysFun(B, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(B, ObjKey, Acc); + CptKey when is_atom(CptKey) -> + CptValue = maps:get(CptKey, CptMap), + FoldKeysFun(B, {CptValue, ObjKey}, Acc) + end; + false -> + Acc + end. + + -spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index a273b128..43fb1573 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -5,93 +5,135 @@ -module(leveled_filter). --export([validate_comparison_expression/2]). 
+-export( + [ + generate_filter_expression/2, + apply_filter/2 + ]). %%%============================================================================ %%% External API %%%============================================================================ -%% @doc validate a comparison expression -%% A comparison expression allows for string comparisons to be made using -%% >, <, >=, =< with andalso, orelse as well as () to group different -%% expressions. -%% The output is a function which will be passed a Map where the Map may be the -%% result of reading some projected attributes on an index string. -%% To compare a string with a key in the map, the name of the key must begin -%% with a "$". -%% In creating the function, the potential keys must be known and passed as a -%% string in the list (without the leading $). --spec validate_comparison_expression( - string(), list(string())) -> - fun((map()) -> {value, boolean(), map()}) | - {error, string(), erl_anno:location()}. -validate_comparison_expression( - Expression, AvailableArgs) when is_list(Expression) -> - case erl_scan:string(Expression) of - {ok, Tokens, _EndLoc} -> - case vert_compexpr_tokens(Tokens, AvailableArgs, []) of - {ok, UpdTokens} -> - case erl_parse:parse_exprs(UpdTokens) of - {ok, [Form]} -> - fun(LookupMap) -> - element(2, erl_eval:expr(Form, LookupMap)) - end; - {error, ErrorInfo} -> - {error, ErrorInfo, undefined} - end; - {error, Error, ErrorLocation} -> - {error, lists:flatten(Error), ErrorLocation} + +apply_filter({condition, Condition}, AttrMap) -> + apply_filter(Condition, AttrMap); +apply_filter({'OR', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) orelse apply_filter(P2, AttrMap); +apply_filter({'AND', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) andalso apply_filter(P2, AttrMap); +apply_filter({'NOT', P1}, AttrMap) -> + not apply_filter(P1, AttrMap); +apply_filter({'BETWEEN', {identifier, _, ID}, CompA, CompB}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(3, CompA), element(3, CompB)} of + {Low, High} when Low =< High -> + V >= Low andalso V =< High; + {High, Low} -> + V >= Low andalso V =< High + end + end; +apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + lists:member(V, lists:map(fun(C) -> element(3, C) end, CheckList)) + end; +apply_filter({{comparator, Cmp, _}, {identifier, _ , ID}, CmpA}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(1, CmpA), V} of + {string, V} when is_binary(V) -> + compare(Cmp, V, element(3, CmpA)); + {integer, V} when is_integer(V) -> + compare(Cmp, V, element(3, CmpA)); + _ -> + false + end + end; +apply_filter({contains, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:find(V, SubStr) of + nomatch -> + false; + _ -> + true end; - {error, Error, ErrorLocation} -> - {error, Error, ErrorLocation} + _ -> + false end; -validate_comparison_expression(_Expression, _AvailableArgs) -> - {error, "Expression not a valid string", undefined}. 
+apply_filter({begins_with, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:prefix(V, SubStr) of + nomatch -> + false; + _ -> + true + end; + _ -> + false + end; +apply_filter({attribute_exists, {identifier, _, ID}}, AttrMap) -> + maps:is_key(ID, AttrMap); +apply_filter({attribute_not_exists, {identifier, _, ID}}, AttrMap) -> + not maps:is_key(ID, AttrMap); +apply_filter({attribute_empty, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + <<>> -> + true; + _ -> + false + end +. + +generate_filter_expression(FilterString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_filterlexer:string(FilterString), + case substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_filterparser:parse(UpdTokens) + end. %%%============================================================================ %%% Internal functions %%%============================================================================ -vert_compexpr_tokens([], _Args, ParsedTokens) -> - {ok, lists:reverse(ParsedTokens)}; -vert_compexpr_tokens([{dot, LN}], Args, ParsedTokens) -> - vert_compexpr_tokens([], Args, [{dot, LN}|ParsedTokens]); -vert_compexpr_tokens(L, _Args, _ParsedTokens) when length(L) == 1 -> - {error, "Query does not end with `.`", undefined}; -vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> - case hd(String) of - 36 -> % 36 == $ - Arg = tl(String), - case lists:member(Arg, Args) of - true -> - VarToken = {var, LN, list_to_atom(Arg)}, - vert_compexpr_tokens(Rest, Args, [VarToken|ParsedTokens]); - false -> - {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} - end; - _ -> % Not a key, treat as a binary - BinString = [{'>>', LN}, {string, LN, String}, {'<<', LN}], - vert_compexpr_tokens( - Rest, Args, BinString ++ ParsedTokens) +compare('>', V, CmpA) -> V > CmpA; +compare('>=', V, CmpA) -> V >= CmpA; +compare('<', V, CmpA) -> V < CmpA; +compare('<=', V, CmpA) -> V =< CmpA; +compare('=', V, CmpA) -> V == CmpA; +compare('<>', V, CmpA) -> V =/= CmpA. 
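
Taken together, generate_filter_expression/2 and apply_filter/2 form the evaluation pipeline introduced by this patch; a minimal sketch (expression, substitution and attribute names illustrative). Identifiers are referenced with a `$` prefix, substitution parameters with `:`, and both identifiers and scanned string literals are carried as binaries:

    {ok, Filter} =
        leveled_filter:generate_filter_expression(
            "($dob BETWEEN \"19740301\" AND \"19761030\")"
            " AND begins_with($pc, :pc)",
            #{<<"pc">> => <<"LS">>}),
    true =
        leveled_filter:apply_filter(
            Filter,
            #{<<"dob">> => <<"19750601">>, <<"pc">> => <<"LS1 4BT">>}).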
+ +substitute_items([], _Subs, UpdTokens) -> + lists:reverse(UpdTokens); +substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> + case maps:get(ID, Subs, notfound) of + notfound -> + {error, + lists:flatten( + io_lib:format("Substitution ~p not found", [ID]))}; + Value when is_binary(Value) -> + substitute_items( + Rest, Subs, [{string, LN, binary_to_list(Value)}|UpdTokens]); + Value when is_integer(Value) -> + substitute_items(Rest, Subs, [{integer, LN, Value}|UpdTokens]); + _UnexpectedValue -> + {error, + lists:flatten( + io_lib:format("Substitution ~p unexpected type", [ID]))} end; -vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); -vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); -vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); -vert_compexpr_tokens([{'orelse', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'orelse', LN}|ParsedTokens]); -vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); -vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'<', LN}|ParsedTokens]); -vert_compexpr_tokens([{'>=', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'>=', LN}|ParsedTokens]); -vert_compexpr_tokens([{'=<', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'=<', LN}|ParsedTokens]); -vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> - {error, io_lib:format("Invalid token ~w", [InvalidToken]), undefined}. +substitute_items([Token|Rest], Subs, UpdTokens) -> + substitute_items(Rest, Subs, [Token|UpdTokens]). %%%============================================================================ @@ -102,74 +144,198 @@ vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> -include_lib("eunit/include/eunit.hrl"). -good_querystring_test() -> - QueryString = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", - AvailableArgs = ["dob", "dod"], - F = validate_comparison_expression(QueryString, AvailableArgs), - M1 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20221216">>}]), - M2 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20191216">>}]), - M3 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20221216">>}]), - M4 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20191216">>}]), - M5 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20241216">>}]), - ?assertMatch(true, F(M1)), - ?assertMatch(true, F(M2)), - ?assertMatch(true, F(M3)), - ?assertMatch(false, F(M4)), - ?assertMatch(false, F(M5)). 
- -bad_querystring_test() -> - QueryString = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString1 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\")", - BadString2 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\")).", - BadString3 = - "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString4 = "os:cmd(foo)", - BadString5 = - [42,63,52,10,240,159,140,190] ++ - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< <<\"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString6 = [(16#110000 + 1)|QueryString], - BadString7 = <<42:32/integer>>, - ?assertMatch( - {error, "Unavailable Arg dod", 1}, - validate_comparison_expression(QueryString, ["dob", "pv"]) - ), +invalid_filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") OR (($b >= \"30\") AND contains($c, :d))", + SubsMissing = maps:from_list([{<<"a">>, <<"MA">>}]), ?assertMatch( - {error, "Query does not end with `.`", undefined}, - validate_comparison_expression(BadString1, ["dob", "dod"]) + {error, "Substitution <<\"d\">> not found"}, + generate_filter_expression(FE1, SubsMissing) ), + SubsWrongType = maps:from_list([{<<"d">>, "42"}]), ?assertMatch( - {error, {1 ,erl_parse, ["syntax error before: ","')'"]}, undefined}, - validate_comparison_expression(BadString2, ["dob", "dod"]) + {error, "Substitution <<\"d\">> unexpected type"}, + generate_filter_expression(FE1, SubsWrongType) ), + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + FE2 = "($a BETWEEN \"A\" AND 12) OR (($b >= \"30\") AND contains($c, :d))", ?assertMatch( - {error, "Invalid token {'and',1}", undefined}, - validate_comparison_expression(BadString3, ["dob", "dod"]) + {error, {1, leveled_filterparser,["syntax error before: ","12"]}}, + generate_filter_expression(FE2, SubsPresent) ), + SubsWrongTypeForContains = maps:from_list([{<<"d">>, 42}]), + FE4 = "($a BETWEEN 12 AND 12) OR (($b >= \"30\") AND contains($c, :d))", ?assertMatch( - {error, "Invalid token {atom,1,os}", undefined}, - validate_comparison_expression(BadString4, ["dob", "dod"]) - ), - ?assertMatch( - {error, "Invalid token {'*',1}", undefined}, - validate_comparison_expression(BadString5, ["dob", "dod"]) - ), - ?assertMatch( - {error, {1, erl_scan, {illegal,character}}, 1}, - validate_comparison_expression(BadString6, ["dob", "dod"]) - ), - ?assertMatch( - {error, "Expression not a valid string", undefined}, - validate_comparison_expression(BadString7, ["dob", "dod"]) + {error, {1, leveled_filterparser, ["syntax error before: ","42"]}}, + generate_filter_expression(FE4, SubsWrongTypeForContains) ). 
+filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) AND contains($c, :d))", + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + {ok, Filter1} = generate_filter_expression(FE1, SubsPresent), + M1 = #{<<"a">> => <<"A11">>, <<"b">> => 100, <<"c">> => <<"CARTMAN">>}, + ?assert(apply_filter(Filter1, M1)), + % ok + + M2 = #{<<"a">> => <<"A11">>, <<"b">> => 10, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M2)), + % $b < 30 + + FE2 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter2} = generate_filter_expression(FE2, SubsPresent), + ?assert(apply_filter(Filter2, M2)), + % OR used so ($b >= 30) = false is ok + + FE3 = "($a BETWEEN \"A12\" AND \"A\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter3} = generate_filter_expression(FE3, SubsPresent), + ?assert(apply_filter(Filter3, M2)), + % swapping the low/high - still ok + + M3 = #{<<"a">> => <<"A11">>, <<"b">> => <<"100">>, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M3)), + % substitution b is not an integer + + FE4 = + "($dob BETWEEN \"19700101\" AND \"19791231\") " + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, Filter4} = generate_filter_expression(FE4, maps:new()), + M4 = + #{ + <<"dob">> => <<"19751124">>, + <<"gns">> => <<"#Mia#Willow#Chloe">>, + <<"pcs">> => <<"#BD1 1DU#LS1 4BT">> + }, + ?assert(apply_filter(Filter4, M4)), + + FE5 = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " OR ($dod > \"20200101\" AND $dod < \"20230101\")", + + {ok, Filter5} = generate_filter_expression(FE5, maps:new()), + F = fun(M) -> apply_filter(Filter5, M) end, + + M5 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20221216">>}]), + M6 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20191216">>}]), + M7 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20221216">>}]), + M8 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20191216">>}]), + M9 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20241216">>}]), + M10 = maps:new(), + ?assertMatch(true, F(M5)), + ?assertMatch(true, F(M6)), + ?assertMatch(true, F(M7)), + ?assertMatch(false, F(M8)), + ?assertMatch(false, F(M9)), + ?assertMatch(false, F(M10)), + + FE5A = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " AND ($dod = \"20221216\")", + {ok, Filter5A} = generate_filter_expression(FE5A, maps:new()), + ?assert(apply_filter(Filter5A, M5)), + ?assertNot(apply_filter(Filter5A, M6)), + + FE6 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob <> \"19993112\"", + {ok, Filter6} = generate_filter_expression(FE6, maps:new()), + M11 = maps:from_list([{<<"dob">>, <<"19993112">>}]), + ?assertMatch(false, apply_filter(Filter6, M11)), + + FE7 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob = \"19993112\"", + {ok, Filter7} = generate_filter_expression(FE7, maps:new()), + ?assert(apply_filter(Filter7, M11)), + + FE8 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", \"19910103\")", + {ok, Filter8} = generate_filter_expression(FE8, maps:new()), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910103">>})), + ?assertNot(apply_filter(Filter8, #{<<"dob">> => <<"19910102">>})), + ?assertNot(apply_filter(Filter8, #{<<"gn">> => <<"Nikki">>})), + + FE9 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", 19910103)", + % Only 
match with a type match + {ok, Filter9} = generate_filter_expression(FE9, maps:new()), + ?assert(apply_filter(Filter9, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter9, #{<<"dob">> => 19910103})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => 19910301})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => <<"19910103">>})), + + FE10 = "NOT contains($gn, \"MA\") AND " + "(NOT $dob IN (\"19910301\", \"19910103\"))", + {ok, Filter10} = generate_filter_expression(FE10, maps:new()), + ?assert( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE11 = "NOT contains($gn, \"MA\") AND " + "NOT $dob IN (\"19910301\", \"19910103\")", + {ok, Filter11} = generate_filter_expression(FE11, maps:new()), + ?assert( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE12 = "begins_with($gn, \"MA\") AND begins_with($fn, :fn)", + {ok, Filter12} = generate_filter_expression(FE12, #{<<"fn">> => <<"SU">>}), + ?assert( + apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => <<"MITTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SIMMS">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => 42, <<"fn">> => <<"SUMMER">>})), + + FE13 = "attribute_exists($dob) AND attribute_not_exists($consent) " + "AND attribute_empty($dod)", + {ok, Filter13} = generate_filter_expression(FE13, maps:new()), + ?assert( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, + <<"consent">> => <<>>, + <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})) + . + -endif. diff --git a/src/leveled_filterlexer.xrl b/src/leveled_filterlexer.xrl new file mode 100644 index 00000000..9fea88ff --- /dev/null +++ b/src/leveled_filterlexer.xrl @@ -0,0 +1,49 @@ +%% Lexer for filter and conditional expressions +%% Author: Thomas Arts + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. + +, : {token, {',', TokenLine}}. +NOT : {token, {'NOT', TokenLine}}. +AND : {token, {'AND', TokenLine}}. +OR : {token, {'OR', TokenLine}}. +BETWEEN : {token, {'BETWEEN', TokenLine}}. +IN : {token, {'IN', TokenLine}}. += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. + +contains : {token, {'contains', TokenLine}}. +begins_with : {token, {'begins_with', TokenLine}}. +attribute_exists : {token, {'attribute_exists', TokenLine}}. +attribute_not_exists : {token, {'attribute_not_exists', TokenLine}}. 
+attribute_empty : {token, {'attribute_empty', TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_filterparser.yrl b/src/leveled_filterparser.yrl new file mode 100644 index 00000000..d9253590 --- /dev/null +++ b/src/leveled_filterparser.yrl @@ -0,0 +1,50 @@ +%% Grammar for filter expressions +%% Author: Thomas Arts + +Nonterminals +top_level condition operand operand_list operands. + + +Terminals +'(' ')' comparator identifier string integer +',' +'NOT' 'AND' 'OR' 'IN' 'BETWEEN' +'contains' 'begins_with' +'attribute_exists' 'attribute_not_exists' 'attribute_empty'. + + +Rootsymbol top_level. + +top_level -> condition: {condition, '$1'}. + +condition -> operand comparator operand : {'$2', '$1', '$3'}. +condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'BETWEEN' integer 'AND' integer : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'IN' operand_list : {'IN', '$1', '$3'}. +condition -> condition 'AND' condition : {'AND', '$1', '$3'}. +condition -> condition 'OR' condition : {'OR', '$1', '$3'}. +condition -> 'NOT' condition : {'NOT', '$2'}. +condition -> 'contains' '(' identifier ',' string ')' : {'contains', '$3', '$5'}. +condition -> 'begins_with' '(' identifier ',' string ')' : {'begins_with', '$3', '$5'}. +condition -> 'attribute_exists' '(' identifier ')' : {'attribute_exists', '$3'}. +condition -> 'attribute_not_exists' '(' identifier ')' : {'attribute_not_exists', '$3'}. +condition -> 'attribute_empty' '(' identifier ')' : {'attribute_empty', '$3'}. +condition -> '(' condition ')' : '$2'. + +operand -> identifier : '$1'. +operand -> string : '$1'. +operand -> integer : '$1'. + +operand_list -> '(' operands ')' : '$2'. + +operands -> operand ',' operands : ['$1' | '$3']. +operands -> operand : ['$1']. + +Endsymbol '$end'. + +Right 200 'NOT'. +Nonassoc 200 comparator. +Left 150 'AND'. +Left 100 'OR'. + +Erlang code. diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 0cf0ebb5..955c036e 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -17,18 +17,17 @@ ]). all() -> [ - % expiring_indexes, - % breaking_folds, - % single_object_with2i, - % small_load_with2i, - % query_count, - % multibucket_fold, - % rotating_objects, - % foldobjects_bybucket_range, + expiring_indexes, + breaking_folds, + single_object_with2i, + small_load_with2i, + query_count, + multibucket_fold, + rotating_objects, + foldobjects_bybucket_range, capture_and_filter_terms ]. - expiring_indexes(_Config) -> % Add objects to the store with index entries, where the objects (and hence % the indexes have an expiry time. 
Confirm that the indexes and the @@ -50,11 +49,11 @@ expiring_indexes(_Config) -> SW1 = os:timestamp(), timer:sleep(1000), - V9 = testutil:get_compressiblevalue(), + V1 = <<"V1">>, Indexes9 = testutil:get_randomindexes_generator(2), TempRiakObjects = testutil:generate_objects( - KeyCount, binary_uuid, [], V9, Indexes9, "riakBucket"), + KeyCount, binary_uuid, [], V1, Indexes9, "riakBucket"), IBKL1 = testutil:stdload_expiring(Bookie1, KeyCount, Future), lists:foreach( @@ -144,11 +143,6 @@ expiring_indexes(_Config) -> Bookie1, B0, K0, 5, <<"value">>, leveled_util:integer_now() + 10), timer:sleep(1000), {async, Folder2} = IndexFold(), - leveled_bookie:book_indexfold(Bookie1, - B0, - {FoldFun, InitAcc}, - {<<"temp_int">>, 5, 8}, - {true, undefined}), QR2 = Folder2(), io:format("Query with additional entry length ~w~n", [length(QR2)]), true = lists:sort(QR2) == lists:sort([{5, B0, K0}|LoadedEntriesInRange]), @@ -203,13 +197,11 @@ breaking_folds(_Config) -> {max_journalsize, 10000000}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + V1 = testutil:get_compressiblevalue_andinteger(), IndexGen = testutil:get_randomindexes_generator(8), - ObjL1 = testutil:generate_objects(KeyCount, - binary_uuid, - [], - ObjectGen, - IndexGen), + ObjL1 = + testutil:generate_objects( + KeyCount, binary_uuid, [], V1, IndexGen), testutil:riakload(Bookie1, ObjL1), % Find all keys index, and then same again but stop at a midpoint using a @@ -284,10 +276,11 @@ breaking_folds(_Config) -> end end, {async, HeadFolderToMidK} = - leveled_bookie:book_headfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(HeadFoldFun), []}, - true, true, false), + leveled_bookie:book_headfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(HeadFoldFun), []}, + true, true, false), KeySizeList2 = lists:reverse(CatchingFold(HeadFolderToMidK)), io:format("Head fold with result size ~w~n", [length(KeySizeList2)]), true = KeyCount div 2 == length(KeySizeList2), @@ -297,21 +290,19 @@ breaking_folds(_Config) -> [{K,byte_size(V)}|Acc] end, {async, ObjectFolderKO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, key_order), ObjSizeList1 = lists:reverse(ObjectFolderKO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1)]), true = KeyCount == length(ObjSizeList1), {async, ObjFolderToMidK} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(ObjFoldFun), []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(ObjFoldFun), []}, + false, + key_order), ObjSizeList2 = lists:reverse(CatchingFold(ObjFolderToMidK)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2)]), true = KeyCount div 2 == length(ObjSizeList2), @@ -321,11 +312,8 @@ breaking_folds(_Config) -> % that was terminated by reaching a point in the key range .. 
as results % will not be passed to the fold function in key order {async, ObjectFolderSO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, sqn_order), ObjSizeList1_SO = lists:reverse(ObjectFolderSO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1_SO)]), true = KeyCount == length(ObjSizeList1_SO), @@ -343,33 +331,26 @@ breaking_folds(_Config) -> end end, {async, ObjFolderTo1K} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowThousandFun(ObjFoldFun), []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowThousandFun(ObjFoldFun), []}, + false, + sqn_order), ObjSizeList2_SO = lists:reverse(CatchingFold(ObjFolderTo1K)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2_SO)]), true = 1000 == length(ObjSizeList2_SO), - ObjL2 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B2"), - ObjL3 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B3"), - ObjL4 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B4"), + ObjectGen = testutil:get_compressiblevalue_andinteger(), + ObjL2 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B2"), + ObjL3 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B3"), + ObjL4 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B4"), testutil:riakload(Bookie1, ObjL2), testutil:riakload(Bookie1, ObjL3), testutil:riakload(Bookie1, ObjL4), @@ -393,15 +374,12 @@ breaking_folds(_Config) -> end, {async, StopAt3BucketFolder} = - leveled_bookie:book_bucketlist(Bookie1, - ?RIAK_TAG, - {StopAt3Fun, []}, - all), + leveled_bookie:book_bucketlist( + Bookie1, ?RIAK_TAG, {StopAt3Fun, []}, all), BucketListSA3 = lists:reverse(CatchingFold(StopAt3BucketFolder)), io:format("bucket list with result ~w~n", [BucketListSA3]), true = [<<"B2">>, <<"B3">>] == BucketListSA3, - ok = leveled_bookie:book_close(Bookie1), testutil:reset_filestructure(). 
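The folds above all stop early through the same idiom: the wrapped fold fun
throws once it has accumulated enough, and the caller catches around the
async runner. CatchingFold is used throughout this test but defined outside
this excerpt; a plausible definition is sketched here, assuming an
illustrative {stop_fold, Acc} throw term rather than the suite's actual one:

    CatchingFold =
        fun(AsyncFolder) ->
            try
                AsyncFolder()
            catch
                throw:{stop_fold, Acc} ->
                    Acc
            end
        end,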
@@ -503,32 +481,29 @@ small_load_with2i(_Config) -> true = 1 == length(KeyList2), %% Delete the objects from the ChkList removing the indexes - lists:foreach(fun({_RN, Obj, Spc}) -> - DSpc = lists:map(fun({add, F, T}) -> - {remove, F, T} - end, - Spc), - {B, K} = - {testutil:get_bucket(Obj), testutil:get_key(Obj)}, - testutil:book_riakdelete(Bookie1, B, K, DSpc) - end, - ChkList1), + lists:foreach( + fun({_RN, Obj, Spc}) -> + DSpc = lists:map(fun({add, F, T}) -> + {remove, F, T} + end, + Spc), + {B, K} = + {testutil:get_bucket(Obj), testutil:get_key(Obj)}, + testutil:book_riakdelete(Bookie1, B, K, DSpc) + end, + ChkList1), %% Get the Buckets Keys and Hashes for the whole bucket - FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] - end, + FoldObjectsFun = + fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] end, - {async, HTreeF1} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldObjectsFun, []}, - false), + {async, HTreeF1} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {FoldObjectsFun, []}, false), KeyHashList1 = HTreeF1(), - {async, HTreeF2} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - "Bucket", - all, - {FoldObjectsFun, []}, - false), + {async, HTreeF2} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, "Bucket", all, {FoldObjectsFun, []}, false), KeyHashList2 = HTreeF2(), {async, HTreeF3} = leveled_bookie:book_objectfold( @@ -543,10 +518,11 @@ small_load_with2i(_Config) -> true = 9900 == length(KeyHashList2), true = 9900 == length(KeyHashList3), - SumIntFun = fun(_B, _K, Obj, Acc) -> - {I, _Bin} = testutil:get_value(Obj), - Acc + I - end, + SumIntFun = + fun(_B, _K, Obj, Acc) -> + {I, _Bin} = testutil:get_value(Obj), + Acc + I + end, BucketObjQ = {foldobjects_bybucket, ?RIAK_TAG, "Bucket", all, {SumIntFun, 0}, true}, {async, Sum1} = leveled_bookie:book_returnfolder(Bookie1, BucketObjQ), @@ -561,8 +537,9 @@ small_load_with2i(_Config) -> lists:foldl(SumFromObjLFun, 0, ObjL1), ChkList1Total = lists:foldl(SumFromObjLFun, 0, ChkList1), - io:format("Total in original object list ~w and from removed list ~w~n", - [ObjL1Total, ChkList1Total]), + io:format( + "Total in original object list ~w and from removed list ~w~n", + [ObjL1Total, ChkList1Total]), Total1 = ObjL1Total - ChkList1Total, @@ -600,7 +577,7 @@ query_count(_Config) -> testutil:check_forobject(Book1, TestObject), lists:foreach( fun(_X) -> - V = testutil:get_compressiblevalue(), + V = <<"TestValue">>, Indexes = testutil:get_randomindexes_generator(8), SW = os:timestamp(), ObjL1 = @@ -826,10 +803,9 @@ query_count(_Config) -> ok = leveled_bookie:book_close(Book4), - {ok, Book5} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Book5} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), {async, BLF3} = leveled_bookie:book_returnfolder(Book5, BucketListQuery), SW_QC = os:timestamp(), BucketSet3 = BLF3(), @@ -849,14 +825,14 @@ capture_and_filter_terms(_Config) -> {ok, Book1} = leveled_bookie:book_start( RootPath, 2000, 50000000, testutil:sync_strategy()), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + V1 = <<"V1">>, IndexGen = fun() -> [{add, IdxName, list_to_binary(perf_SUITE:random_people_index())}] end, ObjL1 = testutil:generate_objects( - 80000, uuid, [], ObjectGen, IndexGen, Bucket), + 100000, uuid, [], V1, IndexGen, Bucket), testutil:riakload(Book1, ObjL1), StartDoB = <<"19740301">>, @@ -1002,71 +978,83 @@ capture_and_filter_terms(_Config) -> end, RunnerRE2_5()), - SW6 = 
os:timestamp(), - - ComparatorExpression3 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\") andalso (\"$dod\" > \"19000101\" orelse \"$dod\" < \"20501231\").", + SW8 = os:timestamp(), + + FilterExpression1 = "($dob BETWEEN \"19740301\" AND \"19761030\")", + {ok, FilterExpressionParsed1} = + leveled_filter:generate_filter_expression( + FilterExpression1, maps:new()), + FilterFun5 = + fun(AttrMap) -> + leveled_filter:apply_filter(FilterExpressionParsed1, AttrMap) + end, - FilterFun3 = - leveled_filter:validate_comparison_expression( - ComparatorExpression3, ["dob", "dod"]), - QueryRE2_6 = + QueryRE2_8 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, {capture, - element(2, re2:compile(WillowLeedsDoubleExtractor)), - [dob, dod], - FilterFun3}} + element(2, re2:compile(WillowLeedsExtractor)), + [<<"dob">>], + FilterFun5}} }, - {async, RunnerRE2_6} = leveled_bookie:book_returnfolder(Book1, QueryRE2_6), - BornMid70sRE2_6 = RunnerRE2_6(), + {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8), + BornMid70sRE2_8 = RunnerRE2_8(), - SW7 = os:timestamp(), - - ComparatorExpression4 = - "\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\".", - - FilterFun4 = - leveled_filter:validate_comparison_expression( - ComparatorExpression4, ["dob"]), - QueryRE2_7 = + SW9 = os:timestamp(), + + PreFilterRE = + "[^\\|]*\\|(?P197[4-6]{1}[0-9]{4})\\|" + "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + "[^\\|]*#LS[^\\|]*", + QueryRE2_9 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, {capture, - element(2, re2:compile(WillowLeedsExtractor)), - [dob], - FilterFun4}} + element(2, re2:compile(PreFilterRE)), + [<<"dob">>], + FilterFun5}} }, - {async, RunnerRE2_7} = leveled_bookie:book_returnfolder(Book1, QueryRE2_7), - BornMid70sRE2_7 = RunnerRE2_7(), + {async, RunnerRE2_9} = leveled_bookie:book_returnfolder(Book1, QueryRE2_9), + BornMid70sRE2_9 = RunnerRE2_9(), - SW8 = os:timestamp(), + SW10 = os:timestamp(), - PreFilterRE = - "[^\\|]*\\|(?P197[4-6]{1}[0-9]{4})\\|" - "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + WillowLeedsExtractor = + "[^\\|]*\\|(?P[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" "[^\\|]*#LS[^\\|]*", - QueryRE2_8 = + + FilterExpression2 = + "($dob BETWEEN \"19740301\" AND \"19761030\")" + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, FilterExpressionParsed2} = + leveled_filter:generate_filter_expression( + FilterExpression2, maps:new()), + FilterFun6 = + fun(AttrMap) -> + leveled_filter:apply_filter(FilterExpressionParsed2, AttrMap) + end, + + QueryRE2_10 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, - element(2, re2:compile(PreFilterRE)), - [dob], - FilterFun4}} + {delimited, + "|", + [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + FilterFun6}} }, - {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8), - BornMid70sRE2_8 = RunnerRE2_8(), + {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10), + BornMid70sRE2_10 = RunnerRE2_10(), - SW9 = os:timestamp(), + SW11 = os:timestamp(), true = length(BornMid70s0) > 0, @@ -1075,47 +1063,46 @@ capture_and_filter_terms(_Config) -> true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5), - true = lists:sort(BornMid70s0) == 
lists:sort(BornMid70sRE2_6), - true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_7), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_8), + true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_9), + true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_10), - io:format( - % user, + maybe_log_toscreen( "~nFilter outside took ~w ms~n", [timer:now_diff(SW1, SW0) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nPCRE Capture filter inside took ~w ms~n", [timer:now_diff(SW2, SW1) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 Capture filter inside took ~w ms~n", [timer:now_diff(SW3, SW2) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 Capture filter outside took ~w ms~n", [timer:now_diff(SW4, SW3) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 double-capture filter outside took ~w ms~n", [timer:now_diff(SW5, SW4) div 1000]), - io:format( - % user, - "~nRE2 double-capture filter with parsed query string took ~w ms~n", - [timer:now_diff(SW7, SW6) div 1000]), - io:format( - % user, - "~nRE2 single-capture filter with parsed query string took ~w ms~n", - [timer:now_diff(SW8, SW7) div 1000]), - io:format( - % user, - "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", + maybe_log_toscreen( + "~nRE2 single-capture filter with parsed filter expression took ~w ms~n", [timer:now_diff(SW9, SW8) div 1000]), + maybe_log_toscreen( + "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", + [timer:now_diff(SW10, SW9) div 1000]), + maybe_log_toscreen( + "~nDelimited index with parsed filter expression took ~w ms~n", + [timer:now_diff(SW11, SW10) div 1000]), ok = leveled_bookie:book_close(Book1), testutil:reset_filestructure(). +maybe_log_toscreen(Log, Subs) -> + io:format( + % user, + Log, + Subs + ). 
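+
+%% A summary sketch, reusing the names bound above: the delimited form in
+%% QueryRE2_10 needs no regex at all -- the runner splits each index term on
+%% the delimiter, zips the parts with the listed keys, and hands the filter
+%% fun an attribute map:
+%%
+%%   {false,
+%%       {delimited, "|",
+%%           [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>],
+%%           FilterFun6}}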
+ count_termsonindex(Bucket, IdxField, Book, QType) -> lists:foldl( @@ -1142,39 +1129,30 @@ count_termsonindex(Bucket, IdxField, Book, QType) -> multibucket_fold(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), + ObjectGen = <<"V1">>, IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(13000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type1">>, <<"Bucket1">>}), + B1 = {<<"Type1">>, <<"Bucket1">>}, + B2 = <<"Bucket2">>, + B3 = <<"Bucket3">>, + B4 = {<<"Type2">>, <<"Bucket4">>}, + ObjL1 = + testutil:generate_objects( + 13000, uuid, [], ObjectGen, IndexGen, B1), testutil:riakload(Bookie1, ObjL1), - ObjL2 = testutil:generate_objects(17000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket2">>), + ObjL2 = + testutil:generate_objects( + 17000, uuid, [], ObjectGen, IndexGen, B2), testutil:riakload(Bookie1, ObjL2), - ObjL3 = testutil:generate_objects(7000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket3">>), + ObjL3 = + testutil:generate_objects( + 7000, uuid, [], ObjectGen, IndexGen, B3), testutil:riakload(Bookie1, ObjL3), - ObjL4 = testutil:generate_objects(23000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type2">>, <<"Bucket4">>}), + ObjL4 = + testutil:generate_objects( + 23000, uuid, [], ObjectGen, IndexGen, B4), testutil:riakload(Bookie1, ObjL4), FF = fun(B, K, _PO, Acc) -> @@ -1241,44 +1219,37 @@ rotating_objects(_Config) -> foldobjects_bybucket_range(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), ObjectGen = testutil:get_compressiblevalue_andinteger(), IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(1300, - {fixed_binary, 1}, - [], - ObjectGen, - IndexGen, - <<"Bucket1">>), + ObjL1 = + testutil:generate_objects( + 1300, {fixed_binary, 1}, [], ObjectGen, IndexGen, <<"Bucket1">>), testutil:riakload(Bookie1, ObjL1), - FoldKeysFun = fun(_B, K,_V, Acc) -> - [ K |Acc] - end, + FoldKeysFun = + fun(_B, K,_V, Acc) -> [ K |Acc] end, StartKey = testutil:fixed_bin_key(123), EndKey = testutil:fixed_bin_key(779), - {async, Folder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - {StartKey, EndKey}, {FoldKeysFun, []}, - true - ), + {async, Folder} = + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + <<"Bucket1">>, + {StartKey, EndKey}, {FoldKeysFun, []}, + true + ), ResLen = length(Folder()), io:format("Length of Result of folder ~w~n", [ResLen]), true = 657 == ResLen, - {async, AllFolder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - all, - {FoldKeysFun, []}, - true - ), + {async, AllFolder} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, <<"Bucket1">>, all, {FoldKeysFun, []}, true), AllResLen = length(AllFolder()), io:format("Length of Result of all keys folder ~w~n", [AllResLen]), diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl index 27549f54..a4606292 100644 --- a/test/end_to_end/perf_SUITE.erl +++ b/test/end_to_end/perf_SUITE.erl @@ -11,6 +11,12 @@ -define(MINI_QUERY_DIVISOR, 8). -define(RGEX_QUERY_DIVISOR, 32). +-ifdef(test_filter_expression). + -define(TEST_FE, true). +-else. 
+ -define(TEST_FE, false). +-endif. + -ifndef(performance). -define(performance, riak_ctperf). -endif. @@ -564,8 +570,53 @@ random_queries(Bookie, Bucket, IDs, IdxCnt, MaxRange, IndexesReturned) -> ), TC div 1000. - random_people_queries(Bookie, Bucket, IndexesReturned) -> + random_people_queries(?TEST_FE, Bookie, Bucket, IndexesReturned). + +random_people_queries(true, Bookie, Bucket, IndexesReturned) -> + FilterExpression = + "($dob BETWEEN \"19700101\" AND \"19791231\") " + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, ParsedFilter} = + leveled_filter:generate_filter_expression( + FilterExpression, maps:new()), + FilterFun = + fun(AttrMap) -> leveled_filter:apply_filter(ParsedFilter, AttrMap) end, + QueryFun = + fun() -> + Surname = get_random_surname(), + Range = + {?PEOPLE_INDEX, + Surname, + <> + }, + FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end, + {async, R} = + leveled_bookie:book_indexfold( + Bookie, + {Bucket, <<>>}, + {FoldKeysFun, 0}, + Range, + {true, + {delimited, + "|", + [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + FilterFun + } + }), + R() + end, + + {TC, {QC, EF}} = + timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end), + ct:log( + ?INFO, + "Fetch of ~w index entries by regex in ~w queries took ~w ms" + " with filter_expression=~w", + [EF, QC, TC div 1000, true] + ), + TC div 1000; +random_people_queries(false, Bookie, Bucket, IndexesReturned) -> SeventiesWillowRegex = "[^\\|]*\\|197[0-9]{5}\\|[^\\|]*\\|" "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*", @@ -578,8 +629,7 @@ random_people_queries(Bookie, Bucket, IndexesReturned) -> Surname, <> }, - {ok, TermRegex} = - leveled_util:regex_compile(SeventiesWillowRegex), + {ok, TermRegex} = leveled_util:regex_compile(SeventiesWillowRegex), FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end, {async, R} = leveled_bookie:book_indexfold( @@ -595,8 +645,9 @@ random_people_queries(Bookie, Bucket, IndexesReturned) -> timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end), ct:log( ?INFO, - "Fetch of ~w index entries by regex in ~w queries took ~w ms", - [EF, QC, TC div 1000] + "Fetch of ~w index entries by regex in ~w queries took ~w ms" + " with filter_expression=~w", + [EF, QC, TC div 1000, false] ), TC div 1000. From bb7c88942c5cfad520b9e011611e1c5afbe5a32f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 29 Apr 2024 21:10:07 +0100 Subject: [PATCH 5/8] Initial lexer/parser for eval pipeline Allow for the generation of projected attributes from an index term --- rebar.config | 2 +- src/leveled_eval.erl | 240 +++++++++++++++++++++++++++++++++++ src/leveled_evallexer.xrl | 33 +++++ src/leveled_evalparser.yrl | 39 ++++++ src/leveled_filter.erl | 32 ++++- src/leveled_filterlexer.xrl | 10 +- src/leveled_filterparser.yrl | 17 +-- 7 files changed, 357 insertions(+), 16 deletions(-) create mode 100644 src/leveled_eval.erl create mode 100644 src/leveled_evallexer.xrl create mode 100644 src/leveled_evalparser.yrl diff --git a/rebar.config b/rebar.config index f2b6ff10..e76eb20d 100644 --- a/rebar.config +++ b/rebar.config @@ -7,7 +7,7 @@ deprecated_function_calls, deprecated_functions]}. {cover_excl_mods, - [leveled_filterlexer, leveled_filterparser, + [leveled_filterlexer, leveled_filterparser, leveled_evallexer, leveled_evalparser, testutil, appdefined_SUITE, basic_SUITE, iterator_SUITE, perf_SUITE, recovery_SUITE, riak_SUITE, tictac_SUITE]}. 
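The leveled_eval module added below is the heart of this commit: an eval
expression is lexed and parsed once, and the parsed form is then applied per
index entry to project attributes from the index term (and key) into a map.
A minimal sketch of the intended flow, using only functions defined in this
patch (the input term and key are invented for illustration):

    {ok, Tokens, _EndLine} =
        leveled_evallexer:string("delim($term, \"|\", ($fn, $dob))"),
    {ok, ParsedEval} = leveled_evalparser:parse(Tokens),
    AttrMap =
        leveled_eval:apply_eval(
            ParsedEval, <<"SMITH|19861216">>, <<"Key0001">>, maps:new()),
    %% AttrMap is now #{<<"fn">> => <<"SMITH">>, <<"dob">> => <<"19861216">>}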
diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl new file mode 100644 index 00000000..4a6619f5 --- /dev/null +++ b/src/leveled_eval.erl @@ -0,0 +1,240 @@ +%% -------- Eval Functions --------- +%% +%% Support for different eval expressions within leveled +%% + +-module(leveled_eval). + +-export([apply_eval/4]). + +%%%============================================================================ +%%% External API +%%%============================================================================ + +apply_eval({eval, Eval}, Term, Key, AttrMap) -> + apply_eval(Eval, Term, Key, AttrMap); +apply_eval({'PIPE', Eval1, 'INTO', Eval2}, Term, Key, AttrMap) -> + apply_eval(Eval2, Term, Key, apply_eval(Eval1, Term, Key, AttrMap)); +apply_eval({ + delim, {identifier, _, InKey}, {string, _, Delim}, ExpKeys}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + CptTerms = string:split(TermToSplit, Delim, all), + L = min(length(CptTerms), length(ExpKeys)), + maps:merge( + AttrMap, + maps:from_list( + lists:zip(lists:sublist(ExpKeys, L), lists:sublist(CptTerms, L))) + ); +apply_eval( + {join, InKeys, {string, _, Delim}, {identifier, _, OutKey}}, + _Term, _Key, AttrMap) -> + NewTerm = + unicode:characters_to_binary( + lists:join( + Delim, + lists:map( + fun(InKey) -> maps:get(InKey, AttrMap, <<"">>) end, + InKeys) + ) + ), + maps:put(OutKey, NewTerm, AttrMap); +apply_eval({ + split, {identifier, _, InKey}, {string, _, Delim}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + TermList = string:split(TermToSplit, Delim, all), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {slice, {identifier, _, InKey}, {integer, _, Width}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToSlice = term_to_process(InKey, Term, Key, AttrMap), + TermCount = string:length(TermToSlice) div Width, + TermList = + lists:map( + fun(S) -> string:slice(TermToSlice, S, Width) end, + lists:map(fun(I) -> Width * I end, lists:seq(0, TermCount - 1))), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {index, {identifier, _, InKey}, + {integer, _, Start}, {integer, _, Length}, + {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToIndex = term_to_process(InKey, Term, Key, AttrMap), + case string:length(TermToIndex) of + L when L >= (Start + Length) -> + maps:put( + OutKey, string:slice(TermToIndex, Start, Length), AttrMap); + _ -> + AttrMap + end; +apply_eval( + {to_integer, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToConvert = term_to_process(InKey, Term, Key, AttrMap), + case string:to_integer(TermToConvert) of + {I, _Rest} when is_integer(I) -> + maps:put(OutKey, I, AttrMap); + _ -> + AttrMap + end. + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +term_to_process(<<"term">>, Term, _Key, _AttrMap) -> + Term; +term_to_process(<<"key">>, _Term, Key, _AttrMap) -> + Key; +term_to_process(AttrKey, _Term, _Key, AttrMap) -> + maps:get(AttrKey, AttrMap, <<"">>). + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). 
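+
+%% A reading aid for the tests below: 'PIPE' composes stages by feeding the
+%% attribute map produced by the left-hand eval into the right-hand one, so
+%% "E1 | E2" evaluates (per the 'PIPE' clause above) as:
+%%
+%%   apply_eval(E2, Term, Key, apply_eval(E1, Term, Key, AttrMap))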
+ +basic_test() -> + EvalString1 = "delim($term, \"|\", ($fn, $dob, $dod, $gns, $pcs))", + EvalString2 = "delim($gns, \"#\", ($gn1, $gn2, $gn3))", + + EvalString3 = EvalString1 ++ " | " ++ EvalString2, + {ok, Tokens3, _EndLine3} = leveled_evallexer:string(EvalString3), + {ok, ParsedExp3} = leveled_evalparser:parse(Tokens3), + EvalOut3 = + apply_eval( + ParsedExp3, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut3)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut3)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut3)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut3)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut3)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut3)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut3)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut3)), + + + EvalString4 = EvalString3 ++ " | join(($dob, $fn), \"|\", $dobfn)", + {ok, Tokens4, _EndLine4} = leveled_evallexer:string(EvalString4), + {ok, ParsedExp4} = leveled_evalparser:parse(Tokens4), + EvalOut4 = + apply_eval( + ParsedExp4, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut4)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut4)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut4)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut4)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut4)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut4)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut4)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut4)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut4)), + + + EvalString5 = EvalString4 ++ " | index($dob, 0, 4, $yob) | to_integer($yob, $yob)", + {ok, Tokens5, _EndLine5} = leveled_evallexer:string(EvalString5), + {ok, ParsedExp5} = leveled_evalparser:parse(Tokens5), + EvalOut5 = + apply_eval( + ParsedExp5, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut5)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut5)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut5)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut5)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut5)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut5)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut5)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut5)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut5)), + ?assertMatch(1986, maps:get(<<"yob">>, EvalOut5)), + + EvalString6 = EvalString1 ++ " | slice($gns, 2, $gns)", + {ok, Tokens6, _EndLine6} = leveled_evallexer:string(EvalString6), + {ok, ParsedExp6} = leveled_evalparser:parse(Tokens6), + EvalOut6 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1Ve|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut6)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut6)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut6)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut6)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut6)), + + EvalOut7 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1VeZ|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + 
?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut7)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut7)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut7)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut7)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut7)), + + EvalString8 = EvalString1 ++ " | split($gns, \"#\", $gns)", + {ok, Tokens8, _EndLine8} = leveled_evallexer:string(EvalString8), + {ok, ParsedExp8} = leveled_evalparser:parse(Tokens8), + EvalOut8 = + apply_eval( + ParsedExp8, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut8)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut8)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut8)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut8)), + ?assertMatch([<<"Willow">>, <<"Mia">>, <<"Vera">>], maps:get(<<"gns">>, EvalOut8)), + + EvalString9 = + "delim($term, \"|\", ($name, $height, $weight, $pick)) |" + " to_integer($height, $height) |" + " to_integer($weight, $weight) |" + " to_integer($pick, $pick) |" + " delim($key, \"|\", ($team, $number)) |" + " index($team, 0, 9, $doh)", + {ok, Tokens9, _EndLine9} = leveled_evallexer:string(EvalString9), + {ok, ParsedExp9} = leveled_evalparser:parse(Tokens9), + EvalOut9 = + apply_eval( + ParsedExp9, + <<"WEMBANYAMA|224cm|95kg|#1">>, + <<"SPURS|00001">>, + maps:new() + ), + ?assertMatch(<<"WEMBANYAMA">>, maps:get(<<"name">>, EvalOut9)), + ?assertMatch(224, maps:get(<<"height">>, EvalOut9)), + ?assertMatch(95, maps:get(<<"weight">>, EvalOut9)), + ?assertMatch(<<"#1">>, maps:get(<<"pick">>, EvalOut9)), + % Not changes as not starting with integer + ?assertMatch(<<"SPURS">>, maps:get(<<"team">>, EvalOut9)), + ?assertMatch(<<"00001">>, maps:get(<<"number">>, EvalOut9)), + ?assertNot(maps:is_key(<<"doh">>, EvalOut9)) + . + + + +-endif. \ No newline at end of file diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl new file mode 100644 index 00000000..29e65fbf --- /dev/null +++ b/src/leveled_evallexer.xrl @@ -0,0 +1,33 @@ +%% Lexer for eval expressions + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +, : {token, {',', TokenLine}}. +\| : {token, {'PIPE', TokenLine}}. + +delim : {token, {delim, TokenLine}}. +join : {token, {join, TokenLine}}. +split : {token, {split, TokenLine}}. +slice : {token, {slice, TokenLine}}. +index : {token, {index, TokenLine}}. +to_integer : {token, {to_integer, TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl new file mode 100644 index 00000000..43a1b2ab --- /dev/null +++ b/src/leveled_evalparser.yrl @@ -0,0 +1,39 @@ +%% Grammar for eval expressions + +Nonterminals +top_level eval identifiers identifier_list. + +Terminals +'(' ')' ',' +identifier string integer +'PIPE' +delim join split slice index to_integer. + +Rootsymbol top_level. 
+ +top_level -> eval: {eval, '$1'}. + +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. +eval -> to_integer '(' identifier ',' 'identifier' ')' : {to_integer, '$3', '$5'}. + +identifier_list -> '(' identifiers ')' : strip_ids('$2'). + +identifiers -> identifier ',' identifiers : ['$1' | '$3']. +identifiers -> identifier : ['$1']. + +Endsymbol '$end'. + +Right 100 'PIPE'. + +Erlang code. + +strip_ids(IDL) -> + lists:map( + fun(ID) -> element(3, ID) end, + lists:flatten(IDL) + ). \ No newline at end of file diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 43fb1573..0e9515fd 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -36,7 +36,14 @@ apply_filter({'BETWEEN', {identifier, _, ID}, CompA, CompB}, AttrMap) -> V >= Low andalso V =< High end end; -apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) -> +apply_filter({'IN', {string, _, TestString}, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + CheckList when is_list(CheckList) -> + lists:member(TestString, CheckList); + _ -> + false + end; +apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) when is_list(CheckList) -> case maps:get(ID, AttrMap, notfound) of notfound -> false; @@ -233,6 +240,12 @@ filterexpression_test() -> {ok, Filter5A} = generate_filter_expression(FE5A, maps:new()), ?assert(apply_filter(Filter5A, M5)), ?assertNot(apply_filter(Filter5A, M6)), + FE5B = + "$dob >= \"19740301\" AND $dob <= \"19761030\"" + " AND $dod = \"20221216\"", + {ok, Filter5B} = generate_filter_expression(FE5B, maps:new()), + ?assert(apply_filter(Filter5B, M5)), + ?assertNot(apply_filter(Filter5B, M6)), FE6 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" @@ -335,7 +348,22 @@ filterexpression_test() -> ?assertNot( apply_filter( Filter13, - #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})) + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})), + + FE14 = "\"M1\" IN $gns", + {ok, Filter14} = generate_filter_expression(FE14, maps:new()), + ?assert( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M1">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M2">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => <<"M1">>})) . -endif. diff --git a/src/leveled_filterlexer.xrl b/src/leveled_filterlexer.xrl index 9fea88ff..6d779f41 100644 --- a/src/leveled_filterlexer.xrl +++ b/src/leveled_filterlexer.xrl @@ -24,11 +24,11 @@ IN : {token, {'IN', TokenLine}}. <= : {token, {comparator, '<=', TokenLine}}. >= : {token, {comparator, '>=', TokenLine}}. -contains : {token, {'contains', TokenLine}}. -begins_with : {token, {'begins_with', TokenLine}}. -attribute_exists : {token, {'attribute_exists', TokenLine}}. -attribute_not_exists : {token, {'attribute_not_exists', TokenLine}}. -attribute_empty : {token, {'attribute_empty', TokenLine}}. +contains : {token, {contains, TokenLine}}. +begins_with : {token, {begins_with, TokenLine}}. +attribute_exists : {token, {attribute_exists, TokenLine}}. 
+attribute_not_exists : {token, {attribute_not_exists, TokenLine}}. +attribute_empty : {token, {attribute_empty, TokenLine}}. \$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. \:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. diff --git a/src/leveled_filterparser.yrl b/src/leveled_filterparser.yrl index d9253590..ca060eb9 100644 --- a/src/leveled_filterparser.yrl +++ b/src/leveled_filterparser.yrl @@ -9,8 +9,8 @@ Terminals '(' ')' comparator identifier string integer ',' 'NOT' 'AND' 'OR' 'IN' 'BETWEEN' -'contains' 'begins_with' -'attribute_exists' 'attribute_not_exists' 'attribute_empty'. +contains begins_with +attribute_exists attribute_not_exists attribute_empty. Rootsymbol top_level. @@ -18,17 +18,18 @@ Rootsymbol top_level. top_level -> condition: {condition, '$1'}. condition -> operand comparator operand : {'$2', '$1', '$3'}. -condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. condition -> operand 'BETWEEN' integer 'AND' integer : {'BETWEEN', '$1', '$3', '$5'}. condition -> operand 'IN' operand_list : {'IN', '$1', '$3'}. +condition -> operand 'IN' identifier : {'IN', '$1', '$3'}. condition -> condition 'AND' condition : {'AND', '$1', '$3'}. condition -> condition 'OR' condition : {'OR', '$1', '$3'}. condition -> 'NOT' condition : {'NOT', '$2'}. -condition -> 'contains' '(' identifier ',' string ')' : {'contains', '$3', '$5'}. -condition -> 'begins_with' '(' identifier ',' string ')' : {'begins_with', '$3', '$5'}. -condition -> 'attribute_exists' '(' identifier ')' : {'attribute_exists', '$3'}. -condition -> 'attribute_not_exists' '(' identifier ')' : {'attribute_not_exists', '$3'}. -condition -> 'attribute_empty' '(' identifier ')' : {'attribute_empty', '$3'}. +condition -> contains '(' identifier ',' string ')' : {contains, '$3', '$5'}. +condition -> begins_with '(' identifier ',' string ')' : {begins_with, '$3', '$5'}. +condition -> attribute_exists '(' identifier ')' : {attribute_exists, '$3'}. +condition -> attribute_not_exists '(' identifier ')' : {attribute_not_exists, '$3'}. +condition -> attribute_empty '(' identifier ')' : {attribute_empty, '$3'}. condition -> '(' condition ')' : '$2'. operand -> identifier : '$1'. From 80ce3c7f1a6feacbf19fb579a9703f0d4bb01079 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 30 Apr 2024 16:27:22 +0100 Subject: [PATCH 6/8] Update and extend the eval expression Allow for additional functions in eval. Change the leveled query API to generalise application of eval/filter expressions. --- src/leveled_codec.erl | 56 +++---- src/leveled_eval.erl | 253 ++++++++++++++++++++++++++++- src/leveled_evallexer.xrl | 39 +++-- src/leveled_evalparser.yrl | 45 +++-- src/leveled_filter.erl | 25 +-- test/end_to_end/iterator_SUITE.erl | 34 ++-- test/end_to_end/perf_SUITE.erl | 15 +- 7 files changed, 381 insertions(+), 86 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 9270668e..7122a275 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -114,10 +114,14 @@ lookup|no_lookup. -type actual_regex() :: reference()|{re_pattern, term(), term(), term(), term()}. --type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). +-type capture_value() :: binary()|integer(). +-type capture_filter_fun() :: + fun((#{binary() => capture_value()}) -> boolean()). 
+-type capture_eval_fun() :: + fun((binary(), binary()) -> #{binary() => capture_value()}). -type capture_reference() :: - {capture, actual_regex(), list(atom()|binary()), capture_filter_fun()}| - {delimited, string(), list(binary()), capture_filter_fun()}. + {capture, actual_regex(), list(binary()), capture_filter_fun()}| + {eval, capture_eval_fun(), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -316,35 +320,28 @@ accumulate_index( case leveled_util:regex_run(IdxValue, TermRegex, Opts) of {match, CptTerms} -> L = min(length(CptTerms), ExpectedKeys), + CptMap = + maps:from_list( + lists:zip( + lists:sublist(CptKeys, L), + lists:sublist(CptTerms, L))), check_captured_terms( - lists:sublist(CptTerms, L), - lists:sublist(CptKeys, L), - FilterFun, - AddTerm, - FoldKeysFun, - Bucket, - IdxValue, - ObjKey, + CptMap, + FilterFun, AddTerm, FoldKeysFun, + Bucket, IdxValue, ObjKey, Acc); _ -> Acc end end; accumulate_index( - {AddTerm, {delimited, Delim, CptKeys, FilterFun}}, FoldKeysFun) -> - ExpectedKeys = length(CptKeys), + {AddTerm, {eval, EvalFun, FilterFun}}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - CptTerms = string:split(IdxValue, Delim, all), - L = min(length(CptTerms), ExpectedKeys), + CptMap = EvalFun(IdxValue, ObjKey), check_captured_terms( - lists:sublist(CptTerms, L), - lists:sublist(CptKeys, L), - FilterFun, - AddTerm, - FoldKeysFun, - Bucket, - IdxValue, - ObjKey, + CptMap, + FilterFun, AddTerm, FoldKeysFun, + Bucket, IdxValue, ObjKey, Acc) end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> @@ -363,8 +360,7 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end. check_captured_terms( - CptTerms, CptKeys, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> - CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + CptMap, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> case FilterFun(CptMap) of true -> case AddTerm of @@ -372,9 +368,13 @@ check_captured_terms( FoldKeysFun(B, {IdxValue, ObjKey}, Acc); false -> FoldKeysFun(B, ObjKey, Acc); - CptKey when is_atom(CptKey) -> - CptValue = maps:get(CptKey, CptMap), - FoldKeysFun(B, {CptValue, ObjKey}, Acc) + CptKey when is_binary(CptKey) -> + case maps:get(CptKey, CptMap, undefined) of + undefined -> + Acc; + CptValue -> + FoldKeysFun(B, {CptValue, ObjKey}, Acc) + end end; false -> Acc diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 4a6619f5..3f9e8614 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -5,12 +5,13 @@ -module(leveled_eval). --export([apply_eval/4]). +-export([apply_eval/4, generate_eval_expression/2]). 
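+
+%% A sketch of how these exports are wired into the {eval, EvalFun, FilterFun}
+%% term handling added to leveled_codec in this commit (names illustrative):
+%%
+%%   {ok, E} = generate_eval_expression(EvalString, maps:new()),
+%%   EvalFun = fun(Term, Key) -> apply_eval(E, Term, Key, maps:new()) end,
+%%   TermHandling = {false, {eval, EvalFun, FilterFun}}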
%%%============================================================================ %%% External API %%%============================================================================ + apply_eval({eval, Eval}, Term, Key, AttrMap) -> apply_eval(Eval, Term, Key, AttrMap); apply_eval({'PIPE', Eval1, 'INTO', Eval2}, Term, Key, AttrMap) -> @@ -40,14 +41,15 @@ apply_eval( ), maps:put(OutKey, NewTerm, AttrMap); apply_eval({ - split, {identifier, _, InKey}, {string, _, Delim}, {identifier, _, OutKey}}, + split, {identifier, _, InKey}, DelimAttr, {identifier, _, OutKey}}, Term, Key, AttrMap) -> TermToSplit = term_to_process(InKey, Term, Key, AttrMap), - TermList = string:split(TermToSplit, Delim, all), + TermList = string:split(TermToSplit, element(3, DelimAttr), all), maps:put(OutKey, TermList, AttrMap); apply_eval( - {slice, {identifier, _, InKey}, {integer, _, Width}, {identifier, _, OutKey}}, + {slice, {identifier, _, InKey}, WidthAttr, {identifier, _, OutKey}}, Term, Key, AttrMap) -> + {integer, _, Width} = WidthAttr, TermToSlice = term_to_process(InKey, Term, Key, AttrMap), TermCount = string:length(TermToSlice) div Width, TermList = @@ -68,6 +70,24 @@ apply_eval( _ -> AttrMap end; +apply_eval( + {kvsplit, + {identifier, _, InKey}, + {string, _, DelimPair}, {string, _, DelimKV}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + lists:foldl( + fun(S, AccMap) -> + case string:split(S, DelimKV, all) of + [K, V] -> + maps:put(K, V, AccMap); + _ -> + AccMap + end + end, + AttrMap, + string:split(TermToSplit, DelimPair, all) + ); apply_eval( {to_integer, {identifier, _, InKey}, {identifier, _, OutKey}}, Term, Key, AttrMap) -> @@ -77,12 +97,66 @@ apply_eval( maps:put(OutKey, I, AttrMap); _ -> AttrMap + end; +apply_eval( + {to_string, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + case term_to_process(InKey, Term, Key, AttrMap) of + TermToConvert when is_integer(TermToConvert) -> + maps:put( + OutKey, + list_to_binary(integer_to_list(TermToConvert)), + AttrMap); + _ -> + AttrMap + end; +apply_eval( + {map, InID, Comparator, MapList, Default, OutID}, + Term, Key, AttrMap) -> + {identifier, _, InKey} = InID, + {identifier, _, OutKey} = OutID, + TermToCompare = term_to_process(InKey, Term, Key, AttrMap), + F = reverse_compare_mapping(element(2, Comparator), TermToCompare), + case lists:dropwhile(F, MapList) of + [] -> + maps:put(OutKey, element(3, Default), AttrMap); + [{mapping, _T, Assignment}|_Rest] -> + maps:put(OutKey, element(3, Assignment), AttrMap) + end; +apply_eval( + {MathOp, OperandX, OperandY, {identifier, _, OutKey}}, + _Term, _Key, AttrMap) + when MathOp == add; MathOp == subtract -> + X = maybe_fetch_operand(OperandX, AttrMap), + Y = maybe_fetch_operand(OperandY, AttrMap), + case MathOp of + add when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X + Y, AttrMap); + subtract when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X - Y, AttrMap); + _ -> + AttrMap + end. + +generate_eval_expression(EvalString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString), + case leveled_filter:substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_evalparser:parse(UpdTokens) end. 
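+
+%% Substitution tokens (":name") are resolved against the map before parsing,
+%% via leveled_filter:substitute_items/3. An illustrative (assumed) use:
+%%
+%%   generate_eval_expression(
+%%       "split($term, :delim, $parts)", #{<<"delim">> => <<"#">>})
+%%
+%% should parse as if the string had been "split($term, \"#\", $parts)".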
+ %%%============================================================================ %%% Internal functions %%%============================================================================ +maybe_fetch_operand({identifier, _, ID}, AttrMap) -> + maps:get(ID, AttrMap, 0); +maybe_fetch_operand(Op, _AttrMap) -> + element(3, Op). + term_to_process(<<"term">>, Term, _Key, _AttrMap) -> Term; term_to_process(<<"key">>, _Term, Key, _AttrMap) -> @@ -90,6 +164,17 @@ term_to_process(<<"key">>, _Term, Key, _AttrMap) -> term_to_process(AttrKey, _Term, _Key, AttrMap) -> maps:get(AttrKey, AttrMap, <<"">>). +reverse_compare_mapping('<', Term) -> + fun({mapping, T, _A}) -> Term >= element(3, T) end; +reverse_compare_mapping('<=', Term) -> + fun({mapping, T, _A}) -> Term > element(3, T) end; +reverse_compare_mapping('>', Term) -> + fun({mapping, T, _A}) -> Term =< element(3, T) end; +reverse_compare_mapping('>=', Term) -> + fun({mapping, T, _A}) -> Term < element(3, T) end; +reverse_compare_mapping('=', Term) -> + fun({mapping, T, _A}) -> Term =/= element(3, T) end. + %%%============================================================================ %%% Test %%%============================================================================ @@ -232,7 +317,165 @@ basic_test() -> % Not changes as not starting with integer ?assertMatch(<<"SPURS">>, maps:get(<<"team">>, EvalOut9)), ?assertMatch(<<"00001">>, maps:get(<<"number">>, EvalOut9)), - ?assertNot(maps:is_key(<<"doh">>, EvalOut9)) + ?assertNot(maps:is_key(<<"doh">>, EvalOut9)), + + %% Age at 30 April 2024 + EvalString10 = + EvalString5 ++ + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + " | add($age, 1, $age_next)" + " | to_string($age, $age)" + , + {ok, Tokens10, _EndLine10} = leveled_evallexer:string(EvalString10), + {ok, ParsedExp10} = leveled_evalparser:parse(Tokens10), + EvalOut10A = + apply_eval( + ParsedExp10, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"37">>, maps:get(<<"age">>, EvalOut10A)), + ?assertMatch(38, maps:get(<<"age_next">>, EvalOut10A)), + EvalOut10B = + apply_eval( + ParsedExp10, + <<"SMITH|19860216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"38">>, maps:get(<<"age">>, EvalOut10B)), + EvalString10F = + EvalString1 ++ + " | index($dob, 0, 4, $yob)" + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + % yob has not been converted to an integer, + % so the age will not be set + " | to_string($age, $age)" + , + {ok, Tokens10F, _EndLine10F} = leveled_evallexer:string(EvalString10F), + {ok, ParsedExp10F} = leveled_evalparser:parse(Tokens10F), + EvalOut10F = + apply_eval( + ParsedExp10F, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertNot(maps:is_key(<<"age">>, EvalOut10F)), + + EvalString11A = + EvalString1 ++ + " | map($dob, <, " + "((\"1946\", \"Silent\"), (\"1966\", \"Boomer\")," + "(\"1980\", \"GenX\"), (\"1997\", \"Millenial\")), \"GenZ\"," + " $generation)", + {ok, Tokens11A, _EndLine11A} = leveled_evallexer:string(EvalString11A), + {ok, ParsedExp11A} = leveled_evalparser:parse(Tokens11A), + EvalOut11A = + apply_eval( + ParsedExp11A, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11A)), + EvalString11B = + 
EvalString1 ++ + " | map($dob, <=, " + "((\"1945\", \"Silent\"), (\"1965\", \"Boomer\")," + "(\"1979\", \"GenX\"), (\"1996\", \"Millenial\")), \"GenZ\"," + " $generation)", + {ok, Tokens11B, _EndLine11B} = leveled_evallexer:string(EvalString11B), + {ok, ParsedExp11B} = leveled_evalparser:parse(Tokens11B), + EvalOut11B = + apply_eval( + ParsedExp11B, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11B)), + EvalString11C = + EvalString1 ++ + " | map($dob, >, " + "((\"1996\", \"GenZ\"), (\"1979\", \"Millenial\")," + "(\"1965\", \"GenX\"), (\"1945\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11C, _EndLine11C} = leveled_evallexer:string(EvalString11C), + {ok, ParsedExp11C} = leveled_evalparser:parse(Tokens11C), + EvalOut11C = + apply_eval( + ParsedExp11C, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11C)), + EvalString11D = + EvalString1 ++ + " | map($dob, >=, " + "((\"1997\", \"GenZ\"), (\"1980\", \"Millenial\")," + "(\"1966\", \"GenX\"), (\"1946\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11D, _EndLine11D} = leveled_evallexer:string(EvalString11D), + {ok, ParsedExp11D} = leveled_evalparser:parse(Tokens11D), + EvalOut11D = + apply_eval( + ParsedExp11D, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11D)), + + EvalString12 = + "kvsplit($term, \"|\", \"=\") | index($term, 0, 12, $ts) |" + " to_integer($ts, $ts) |" + " to_integer($DEBUG, $DEBUG) |" + " to_integer($INFO, $INFO) |" + " to_integer($WARN, $WARN) |" + " to_integer($ERROR, $ERROR) |" + " to_integer($CRITICAL, $CRITICAL) |" + " add($DEBUG, $INFO, $TOTAL) |" + " add($TOTAL, $WARN, $TOTAL) |" + " add($TOTAL, $ERROR, $TOTAL) |" + " add($TOTAL, $CRITICAL, $TOTAL)" + , + {ok, Tokens12, _EndLine12} = leveled_evallexer:string(EvalString12), + {ok, ParsedExp12} = leveled_evalparser:parse(Tokens12), + EvalOut12 = + apply_eval( + ParsedExp12, + <<"063881703147|DEBUG=804|INFO=186|WARN=10">>, + <<"ABC1233">>, + maps:new() + ), + ?assertMatch(63881703147, maps:get(<<"ts">>, EvalOut12)), + ?assertMatch(1000, maps:get(<<"TOTAL">>, EvalOut12)), + ?assertNot(maps:is_key(<<"CRITICAL">>, EvalOut12)), + + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(\"1972\", \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, Tokens13, _EndLine13} = leveled_evallexer:string(EvalString13), + {ok, ParsedExp13} = leveled_evalparser:parse(Tokens13), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + EvalOut13B = + apply_eval(ParsedExp13, <<"cup_year:1970">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"bad">>, maps:get(<<"cup_happy">>, EvalOut13B)), + EvalOut13C = + apply_eval(ParsedExp13, <<"cup_year:2024">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C)) . diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl index 29e65fbf..dcee00fc 100644 --- a/src/leveled_evallexer.xrl +++ b/src/leveled_evallexer.xrl @@ -7,19 +7,32 @@ Rules. {WhiteSpace} : skip_token. -\( : {token, {'(', TokenLine}}. -\) : {token, {')', TokenLine}}. 
-, : {token, {',', TokenLine}}. -\| : {token, {'PIPE', TokenLine}}. - -delim : {token, {delim, TokenLine}}. -join : {token, {join, TokenLine}}. -split : {token, {split, TokenLine}}. -slice : {token, {slice, TokenLine}}. -index : {token, {index, TokenLine}}. -to_integer : {token, {to_integer, TokenLine}}. +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +, : {token, {',', TokenLine}}. +\| : {token, {'PIPE', TokenLine}}. + +delim : {token, {delim, TokenLine}}. +join : {token, {join, TokenLine}}. +split : {token, {split, TokenLine}}. +slice : {token, {slice, TokenLine}}. +index : {token, {index, TokenLine}}. +kvsplit : {token, {kvsplit, TokenLine}}. +to_integer : {token, {to_integer, TokenLine}}. +to_string : {token, {to_string, TokenLine}}. +add : {token, {add, TokenLine}}. +subtract : {token, {subtract, TokenLine}}. +map : {token, {map, TokenLine}}. + += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. \$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. \"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" [0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. @@ -30,4 +43,8 @@ strip_string(TokenChars) -> strip_identifier(TokenChars) -> [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl index 43a1b2ab..be475980 100644 --- a/src/leveled_evalparser.yrl +++ b/src/leveled_evalparser.yrl @@ -1,27 +1,50 @@ %% Grammar for eval expressions Nonterminals -top_level eval identifiers identifier_list. +top_level eval +operand math_operand +mapping mappings mappings_list +identifiers identifier_list. Terminals '(' ')' ',' -identifier string integer +identifier string integer comparator 'PIPE' -delim join split slice index to_integer. +delim join split slice index kvsplit map +add subtract +to_integer to_string. Rootsymbol top_level. top_level -> eval: {eval, '$1'}. -eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. -eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. -eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. -eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. -eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. -eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. -eval -> to_integer '(' identifier ',' 'identifier' ')' : {to_integer, '$3', '$5'}. +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. 
+eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. +eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. +eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. +eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. +eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. +eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. -identifier_list -> '(' identifiers ')' : strip_ids('$2'). +mappings_list -> '(' mappings ')' : '$2'. + +mappings -> mapping ',' mappings : ['$1' | '$3']. +mappings -> mapping : ['$1']. + +mapping -> '(' operand ',' operand ')' : {mapping, '$2', '$4'}. + +operand -> string : '$1'. +operand -> integer : '$1'. + +math_operand -> integer : '$1'. +math_operand -> identifier : '$1'. + +identifier_list -> '(' identifiers ')' : strip_ids('$2'). identifiers -> identifier ',' identifiers : ['$1' | '$3']. identifiers -> identifier : ['$1']. diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 0e9515fd..5d17627f 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -8,7 +8,8 @@ -export( [ generate_filter_expression/2, - apply_filter/2 + apply_filter/2, + substitute_items/3 ]). %%%============================================================================ @@ -110,17 +111,6 @@ generate_filter_expression(FilterString, Substitutions) -> leveled_filterparser:parse(UpdTokens) end. -%%%============================================================================ -%%% Internal functions -%%%============================================================================ - -compare('>', V, CmpA) -> V > CmpA; -compare('>=', V, CmpA) -> V >= CmpA; -compare('<', V, CmpA) -> V < CmpA; -compare('<=', V, CmpA) -> V =< CmpA; -compare('=', V, CmpA) -> V == CmpA; -compare('<>', V, CmpA) -> V =/= CmpA. - substitute_items([], _Subs, UpdTokens) -> lists:reverse(UpdTokens); substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> @@ -142,6 +132,17 @@ substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> substitute_items([Token|Rest], Subs, UpdTokens) -> substitute_items(Rest, Subs, [Token|UpdTokens]). +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +compare('>', V, CmpA) -> V > CmpA; +compare('>=', V, CmpA) -> V >= CmpA; +compare('<', V, CmpA) -> V < CmpA; +compare('<=', V, CmpA) -> V =< CmpA; +compare('=', V, CmpA) -> V == CmpA; +compare('<>', V, CmpA) -> V =/= CmpA. 
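+
+%% Note that compare/3 relies on Erlang term ordering, so binary attributes
+%% compare byte-wise; fixed-width date strings therefore order correctly in
+%% BETWEEN and comparator conditions, e.g.
+%%   compare('<', <<"19740301">>, <<"19761030">>) =:= true.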
+ %%%============================================================================ %%% Test diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 955c036e..be0cb71b 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -879,7 +879,7 @@ capture_and_filter_terms(_Config) -> "[^\\|]*#LS[^\\|]*", FilterFun1 = fun(Captures) -> - DoB = maps:get(dob, Captures), + DoB = maps:get(<<"dob">>, Captures), (DoB >= StartDoB) andalso (DoB =< EndDoB) end, {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor), @@ -889,7 +889,7 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, WillowLeedsExtractorPCRE, [dob], FilterFun1}} + {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}} }, {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1), BornMid70sPCRE1 = RunnerPCRE1(), @@ -903,20 +903,21 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}} + {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}} }, {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2), BornMid70sRE2_2 = RunnerRE2_2(), SW3 = os:timestamp(), + AllFun = fun(_) -> true end, QueryRE2_3 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, - {dob, - {capture, WillowLeedsExtractorRE2, [dob], fun(_) -> true end}} + {<<"dob">>, + {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}} }, {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3), Results3 = RunnerRE2_3(), @@ -941,7 +942,7 @@ capture_and_filter_terms(_Config) -> FilterFun2 = fun(Captures) -> - DoB = maps:get(dob, Captures), + DoB = maps:get(<<"dob">>, Captures), (DoB >= StartDoB) andalso (DoB =< EndDoB) end, QueryRE2_4 = @@ -952,7 +953,7 @@ capture_and_filter_terms(_Config) -> {false, {capture, element(2, re2:compile(WillowLeedsDoubleExtractor)), - [dob, dod], + [<<"dob">>, <<"dod">>], FilterFun2}} }, {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4), @@ -966,7 +967,7 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {true, - {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}} + {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}} }, {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5), BornMid70sRE2_5 = @@ -1039,16 +1040,23 @@ capture_and_filter_terms(_Config) -> fun(AttrMap) -> leveled_filter:apply_filter(FilterExpressionParsed2, AttrMap) end, - + {ok, EvalExpression2} = + leveled_eval:generate_eval_expression( + "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))", + maps:new() + ), + EvalFun2 = + fun(Term, Key) -> + leveled_eval:apply_eval(EvalExpression2, Term, Key, maps:new()) + end, QueryRE2_10 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {delimited, - "|", - [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + {eval, + EvalFun2, FilterFun6}} }, {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10), @@ -1089,7 +1097,7 @@ capture_and_filter_terms(_Config) -> "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", [timer:now_diff(SW10, SW9) div 1000]), maybe_log_toscreen( - "~nDelimited index with parsed filter expression took ~w ms~n", + "~nEval processed index with parsed filter 
expression took ~w ms~n", [timer:now_diff(SW11, SW10) div 1000]), ok = leveled_bookie:book_close(Book1), diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl index a4606292..e4c54117 100644 --- a/test/end_to_end/perf_SUITE.erl +++ b/test/end_to_end/perf_SUITE.erl @@ -582,6 +582,14 @@ random_people_queries(true, Bookie, Bucket, IndexesReturned) -> FilterExpression, maps:new()), FilterFun = fun(AttrMap) -> leveled_filter:apply_filter(ParsedFilter, AttrMap) end, + EvalExpression = "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))", + {ok, ParsedEval} = + leveled_eval:generate_eval_expression(EvalExpression, maps:new()), + EvalFun = + fun(Term, Key) -> + leveled_eval:apply_eval(ParsedEval, Term, Key, maps:new()) + end, + QueryFun = fun() -> Surname = get_random_surname(), @@ -597,12 +605,7 @@ random_people_queries(true, Bookie, Bucket, IndexesReturned) -> {Bucket, <<>>}, {FoldKeysFun, 0}, Range, - {true, - {delimited, - "|", - [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], - FilterFun - } + {true, {eval, EvalFun, FilterFun} }), R() end, From 1563a7c90f21285ccffb57850f08058086e7470f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 30 Apr 2024 21:46:29 +0100 Subject: [PATCH 7/8] Extend testing --- src/leveled_eval.erl | 17 +++++++++++++++++ src/leveled_filter.erl | 2 +- test/end_to_end/iterator_SUITE.erl | 12 ++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 3f9e8614..8084af2a 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -479,5 +479,22 @@ basic_test() -> . +generate_test() -> + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(:clarke, \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, ParsedExp13} = + generate_eval_expression(EvalString13, #{<<"clarke">> => <<"1972">>}), + io:format("ParsedExp ~p~n", [ParsedExp13]), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + ?assertMatch( + {error, "Substitution <<\"clarke\">> not found"}, + generate_eval_expression(EvalString13, maps:new()) + ). -endif. 
\ No newline at end of file diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 5d17627f..7761d4f4 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -121,7 +121,7 @@ substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> io_lib:format("Substitution ~p not found", [ID]))}; Value when is_binary(Value) -> substitute_items( - Rest, Subs, [{string, LN, binary_to_list(Value)}|UpdTokens]); + Rest, Subs, [{string, LN, Value}|UpdTokens]); Value when is_integer(Value) -> substitute_items(Rest, Subs, [{integer, LN, Value}|UpdTokens]); _UnexpectedValue -> diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index be0cb71b..cb7a5a4a 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -1100,6 +1100,18 @@ capture_and_filter_terms(_Config) -> "~nEval processed index with parsed filter expression took ~w ms~n", [timer:now_diff(SW11, SW10) div 1000]), + QueryRE2_3_WrongCapture = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {<<"gns">>, + {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}} + }, + {async, RunnerRE2_3_WC} = + leveled_bookie:book_returnfolder(Book1, QueryRE2_3_WrongCapture), + true = [] == RunnerRE2_3_WC(), + ok = leveled_bookie:book_close(Book1), testutil:reset_filestructure(). From 59c556afa3ded9ad5f244c2f35226f40acecee5d Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 1 May 2024 16:01:19 +0100 Subject: [PATCH 8/8] Add regex eval function to pipeline Remove `capture` option - as all can be achieved via eval/filter. --- src/leveled_codec.erl | 23 -------- src/leveled_eval.erl | 88 +++++++++++++++++++++++++----- src/leveled_evallexer.xrl | 3 + src/leveled_evalparser.yrl | 45 +++++++++------ src/leveled_util.erl | 10 +++- test/end_to_end/iterator_SUITE.erl | 71 ++++++++++++------------ 6 files changed, 151 insertions(+), 89 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 7122a275..52a8b9d1 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -120,7 +120,6 @@ -type capture_eval_fun() :: fun((binary(), binary()) -> #{binary() => capture_value()}). -type capture_reference() :: - {capture, actual_regex(), list(binary()), capture_filter_fun()}| {eval, capture_eval_fun(), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -312,28 +311,6 @@ accumulate_index({true, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) end; -accumulate_index( - {AddTerm, {capture, TermRegex, CptKeys, FilterFun}}, FoldKeysFun) -> - ExpectedKeys = length(CptKeys), - Opts = [{capture, all_but_first, binary}], - fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - case leveled_util:regex_run(IdxValue, TermRegex, Opts) of - {match, CptTerms} -> - L = min(length(CptTerms), ExpectedKeys), - CptMap = - maps:from_list( - lists:zip( - lists:sublist(CptKeys, L), - lists:sublist(CptTerms, L))), - check_captured_terms( - CptMap, - FilterFun, AddTerm, FoldKeysFun, - Bucket, IdxValue, ObjKey, - Acc); - _ -> - Acc - end - end; accumulate_index( {AddTerm, {eval, EvalFun, FilterFun}}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 8084af2a..4f3b3b46 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -5,12 +5,27 @@ -module(leveled_eval). 
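+
+%% With the `capture` query option removed, the equivalent behaviour comes
+%% from the eval pipeline. A minimal sketch, assuming an index term with a
+%% leading surname field and an eight-digit date of birth: a query that
+%% previously passed {capture, CompiledRE, [<<"dob">>], FilterFun} would
+%% now be built as
+%%
+%%   EvalFun =
+%%       leveled_eval:generate_eval_function(
+%%           "regex($term, :regex, re2, ($dob))",
+%%           #{<<"regex">> => <<"[^\\|]*\\|(?P<dob>[0-9]{8})\\|.*">>}),
+%%
+%% and passed in the TermHandling tuple as {false, {eval, EvalFun, FilterFun}}.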
--export([apply_eval/4, generate_eval_expression/2]).
+-export([apply_eval/4, generate_eval_expression/2, generate_eval_function/2]).
 
 %%%============================================================================
 %%% External API
 %%%============================================================================
 
+generate_eval_function(EvalString, Substitutions) ->
+    {ok, ParsedEval} = generate_eval_expression(EvalString, Substitutions),
+    fun(Term, Key) ->
+        apply_eval(ParsedEval, Term, Key, maps:new())
+    end.
+
+generate_eval_expression(EvalString, Substitutions) ->
+    {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString),
+    case leveled_filter:substitute_items(Tokens, Substitutions, []) of
+        {error, Error} ->
+            {error, Error};
+        UpdTokens ->
+            leveled_evalparser:parse(UpdTokens)
+    end.
+
 apply_eval({eval, Eval}, Term, Key, AttrMap) ->
     apply_eval(Eval, Term, Key, AttrMap);
 
@@ -136,17 +151,26 @@ apply_eval(
             maps:put(OutKey, X - Y, AttrMap);
         _ ->
             AttrMap
-    end.
-
-generate_eval_expression(EvalString, Substitutions) ->
-    {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString),
-    case leveled_filter:substitute_items(Tokens, Substitutions, []) of
-        {error, Error} ->
-            {error, Error};
-        UpdTokens ->
-            leveled_evalparser:parse(UpdTokens)
-    end.
-
+    end;
+apply_eval(
+    {regex, {identifier, _, InKey}, CompiledRE, ExpKeys},
+    Term, Key, AttrMap) ->
+    TermToCapture = term_to_process(InKey, Term, Key, AttrMap),
+    ExpectedKeyLength = length(ExpKeys),
+    Opts = [{capture, all_but_first, binary}],
+    case leveled_util:regex_run(TermToCapture, CompiledRE, Opts) of
+        {match, CptTerms} ->
+            L = min(length(CptTerms), ExpectedKeyLength),
+            CptMap =
+                maps:from_list(
+                    lists:zip(
+                        lists:sublist(ExpKeys, L),
+                        lists:sublist(CptTerms, L))),
+            maps:merge(AttrMap, CptMap);
+        _ ->
+            AttrMap
+    end
+    .
 
 %%%============================================================================
 %%% Internal functions
@@ -475,10 +499,47 @@ basic_test() ->
     ?assertMatch(<<"bad">>, maps:get(<<"cup_happy">>, EvalOut13B)),
     EvalOut13C =
         apply_eval(ParsedExp13, <<"cup_year:2024">>, <<"ABC1">>, maps:new()),
-    ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C))
+    ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C)),
+
+    ExtractRegex =
+        "(?P<fn>[^\\|]*)\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
+        "(?P<gns>[^\\|]*)\\|(?P<pcs>[^\\|]*)|.",
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, re2, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        ),
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, pcre, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        ),
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        )
     .
 
+check_regex_eval(EvalString14, ExtractRegex) ->
+    {ok, ParsedExp14} =
+        generate_eval_expression(
+            EvalString14,
+            #{<<"regex">> => list_to_binary(ExtractRegex)}
+        ),
+    EvalOut14 =
+        apply_eval(
+            ParsedExp14,
+            <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>,
+            <<"9000000001">>,
+            maps:new()
+        ),
+    ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut14)),
+    ok.
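+
+%% A usage sketch of the exported form; the pattern and term below are
+%% assumed examples in the style of check_regex_eval/2 above:
+regex_eval_usage_test() ->
+    EvalFun =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($fn, $dob))",
+            #{<<"regex">> => <<"(?P<fn>[^\\|]*)\\|(?P<dob>[0-9]{8}).*">>}
+        ),
+    AttrMap = EvalFun(<<"SMITH|19861216">>, <<"9000000001">>),
+    ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, AttrMap)),
+    ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, AttrMap)).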
+ + generate_test() -> EvalString13 = "kvsplit($term, \"|\", \":\") |" @@ -488,7 +549,6 @@ generate_test() -> "\"indifferent\", $cup_happy) ", {ok, ParsedExp13} = generate_eval_expression(EvalString13, #{<<"clarke">> => <<"1972">>}), - io:format("ParsedExp ~p~n", [ParsedExp13]), EvalOut13A = apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl index dcee00fc..5944445c 100644 --- a/src/leveled_evallexer.xrl +++ b/src/leveled_evallexer.xrl @@ -18,11 +18,14 @@ split : {token, {split, TokenLine}}. slice : {token, {slice, TokenLine}}. index : {token, {index, TokenLine}}. kvsplit : {token, {kvsplit, TokenLine}}. +regex : {token, {regex, TokenLine}}. to_integer : {token, {to_integer, TokenLine}}. to_string : {token, {to_string, TokenLine}}. add : {token, {add, TokenLine}}. subtract : {token, {subtract, TokenLine}}. map : {token, {map, TokenLine}}. +pcre : {token, {pcre, TokenLine}}. +re2 : {token, {re2, TokenLine}}. = : {token, {comparator, '=', TokenLine}}. < : {token, {comparator, '<', TokenLine}}. diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl index be475980..58384aa4 100644 --- a/src/leveled_evalparser.yrl +++ b/src/leveled_evalparser.yrl @@ -2,7 +2,7 @@ Nonterminals top_level eval -operand math_operand +operand math_operand regex_method mapping mappings mappings_list identifiers identifier_list. @@ -10,26 +10,29 @@ Terminals '(' ')' ',' identifier string integer comparator 'PIPE' -delim join split slice index kvsplit map +delim join split slice index kvsplit regex map add subtract -to_integer to_string. +to_integer to_string +pcre re2. Rootsymbol top_level. top_level -> eval: {eval, '$1'}. -eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. -eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. -eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. -eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. -eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. -eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. -eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. -eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. -eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. -eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. -eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. -eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. +eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. 
+eval -> regex '(' identifier ',' string ',' regex_method ',' identifier_list ')' : {regex, '$3', re_compile('$5', '$7'), '$9'}. +eval -> regex '(' identifier ',' string ',' identifier_list ')' : {regex, '$3', re_compile('$5'), '$7'}. +eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. +eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. +eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. +eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. +eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. mappings_list -> '(' mappings ')' : '$2'. @@ -44,6 +47,9 @@ operand -> integer : '$1'. math_operand -> integer : '$1'. math_operand -> identifier : '$1'. +regex_method -> pcre : '$1'. +regex_method -> re2 : '$1'. + identifier_list -> '(' identifiers ')' : strip_ids('$2'). identifiers -> identifier ',' identifiers : ['$1' | '$3']. @@ -59,4 +65,11 @@ strip_ids(IDL) -> lists:map( fun(ID) -> element(3, ID) end, lists:flatten(IDL) - ). \ No newline at end of file + ). + +re_compile(RegexStr) -> + re_compile(RegexStr, {re2, element(2, RegexStr)}). + +re_compile({string, _LN, Regex}, Method) -> + {ok, CRE} = leveled_util:regex_compile(Regex, element(1, Method)), + CRE. \ No newline at end of file diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 9f82fdac..fdf32ae5 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -12,7 +12,8 @@ t2b/1, safe_rename/4, regex_run/3, - regex_compile/1 + regex_compile/1, + regex_compile/2 ]). -define(WRITE_OPS, [binary, raw, read, write]). @@ -68,7 +69,12 @@ regex_run(Subject, CompiledPCRE, Opts) -> -spec regex_compile(iodata()) -> {ok, reference()}. regex_compile(PlainRegex) -> - re2:compile(PlainRegex). + regex_compile(PlainRegex, re2). + +regex_compile(PlainRegex, re2) -> + re2:compile(PlainRegex); +regex_compile(PlainRegex, pcre) -> + re:compile(PlainRegex). -spec magic_hash(any()) -> 0..16#FFFFFFFF. 
%% @doc
diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl
index cb7a5a4a..74e71f79 100644
--- a/test/end_to_end/iterator_SUITE.erl
+++ b/test/end_to_end/iterator_SUITE.erl
@@ -844,6 +844,7 @@ capture_and_filter_terms(_Config) ->
 
     SW0 = os:timestamp(),
     {ok, WillowLeedsPCRE} = re:compile(WillowLeedsFinder),
+
     QueryPCRE0 =
         {index_query,
             {Bucket, null},
@@ -879,31 +880,38 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*#LS[^\\|]*",
     FilterFun1 =
         fun(Captures) ->
-            DoB = maps:get(<<"dob">>, Captures),
+            DoB = maps:get(<<"dob">>, Captures, notfound),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
-    {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
+    EvalFunPCRE =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, pcre, ($dob))",
+            #{<<"regex">> => list_to_binary(WillowLeedsExtractor)}
+        ),
+
     QueryPCRE1 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+            {false, {eval, EvalFunPCRE, FilterFun1}}
         },
     {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
     BornMid70sPCRE1 = RunnerPCRE1(),
     SW2 = os:timestamp(),
 
-    {ok, WillowLeedsExtractorRE2} = re2:compile(WillowLeedsExtractor),
+    EvalFunRE2 =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob))",
+            #{<<"regex">> => list_to_binary(WillowLeedsExtractor)}
+        ),
     QueryRE2_2 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+            {false, {eval, EvalFunRE2, FilterFun1}}
         },
     {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
     BornMid70sRE2_2 = RunnerRE2_2(),
@@ -916,8 +924,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"dob">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}}
+            {<<"dob">>, {eval, EvalFunRE2, AllFun}}
         },
     {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
     Results3 = RunnerRE2_3(),
@@ -939,10 +946,15 @@ capture_and_filter_terms(_Config) ->
     WillowLeedsDoubleExtractor =
         "[^\\|]*\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
         "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*",
-
+    EvalFunRE2_2 =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob, $dod))",
+            #{<<"regex">> => list_to_binary(WillowLeedsDoubleExtractor)}
+        ),
+
     FilterFun2 =
         fun(Captures) ->
-            DoB = maps:get(<<"dob">>, Captures),
+            DoB = maps:get(<<"dob">>, Captures, notfound),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     QueryRE2_4 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
-                    [<<"dob">>, <<"dod">>],
-                    FilterFun2}}
+            {false, {eval, EvalFunRE2_2, FilterFun2}}
         },
     {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
     BornMid70sRE2_4 = RunnerRE2_4(),
@@ -966,10 +974,10 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {true,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+            {true, {eval, EvalFunRE2, FilterFun1}}
         },
     {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
+    {ok, WillowLeedsExtractorRE2} = re2:compile(WillowLeedsExtractor),
     BornMid70sRE2_5 =
         lists:filtermap(
             fun({T, K}) ->
@@ -995,11 +1003,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(WillowLeedsExtractor)),
-                    [<<"dob">>],
-                    FilterFun5}}
+            {false, {eval, EvalFunRE2, FilterFun5}}
         },
     {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8),
     BornMid70sRE2_8 = RunnerRE2_8(),
@@ -1010,16 +1014,18 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*\\|(?P<dob>197[4-6]{1}[0-9]{4})\\|"
        "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
         "[^\\|]*#LS[^\\|]*",
+    PreFilterEvalFun =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob))",
+            #{<<"regex">> => list_to_binary(PreFilterRE)}
+        ),
+
     QueryRE2_9 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(PreFilterRE)),
-                    [<<"dob">>],
-                    FilterFun5}}
+            {false, {eval, PreFilterEvalFun, FilterFun5}}
         },
     {async, RunnerRE2_9} = leveled_bookie:book_returnfolder(Book1, QueryRE2_9),
     BornMid70sRE2_9 = RunnerRE2_9(),
@@ -1054,10 +1060,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {eval,
-                    EvalFun2,
-                    FilterFun6}}
+            {false, {eval, EvalFun2, FilterFun6}}
         },
     {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10),
     BornMid70sRE2_10 = RunnerRE2_10(),
@@ -1100,13 +1103,13 @@ capture_and_filter_terms(_Config) ->
         "~nEval processed index with parsed filter expression took ~w ms~n",
         [timer:now_diff(SW11, SW10) div 1000]),
 
+
     QueryRE2_3_WrongCapture =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"gns">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}}
+            {<<"gns">>, {eval, EvalFunRE2, FilterFun6}}
         },
     {async, RunnerRE2_3_WC} =
        leveled_bookie:book_returnfolder(Book1, QueryRE2_3_WrongCapture),