diff --git a/.gitignore b/.gitignore index 3906bbcd..2dc6c0c8 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ cover_* .eqc-info leveled_data/* compile_commands.json +*parser.erl +*lexer.erl diff --git a/rebar.config b/rebar.config index e73ff4b7..e76eb20d 100644 --- a/rebar.config +++ b/rebar.config @@ -4,11 +4,11 @@ {xref_checks, [undefined_function_calls,undefined_functions, - locals_not_used, deprecated_function_calls, deprecated_functions]}. {cover_excl_mods, - [testutil, + [leveled_filterlexer, leveled_filterparser, leveled_evallexer, leveled_evalparser, + testutil, appdefined_SUITE, basic_SUITE, iterator_SUITE, perf_SUITE, recovery_SUITE, riak_SUITE, tictac_SUITE]}. @@ -23,6 +23,7 @@ ]}, {test, [{extra_src_dirs, ["test/end_to_end", "test/property"]} ]}, + {perf_fexp, [{erl_opts, [{d, test_filter_expression}]}]}, {perf_full, [{erl_opts, [{d, performance, riak_fullperf}]}]}, {perf_mini, [{erl_opts, [{d, performance, riak_miniperf}]}]}, {perf_prof, [{erl_opts, [{d, performance, riak_profileperf}]}]} diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 34a2435f..ade345ca 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -696,7 +696,7 @@ book_returnfolder(Pid, RunnerType) -> IndexVal::term(), Start::IndexVal, End::IndexVal, - ReturnTerms::boolean(), + ReturnTerms::boolean()|binary(), TermRegex :: leveled_codec:regular_expression(). book_indexfold(Pid, Constraint, FoldAccT, Range, TermHandling) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index fd1ef627..52a8b9d1 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -112,10 +112,17 @@ {index_specs(), infinity|integer()}. % {KeyChanges, TTL} -type maybe_lookup() :: lookup|no_lookup. +-type actual_regex() :: + reference()|{re_pattern, term(), term(), term(), term()}. +-type capture_value() :: binary()|integer(). +-type capture_filter_fun() :: + fun((#{binary() => capture_value()}) -> boolean()). +-type capture_eval_fun() :: + fun((binary(), binary()) -> #{binary() => capture_value()}). +-type capture_reference() :: + {eval, capture_eval_fun(), capture_filter_fun()}. -type regular_expression() :: - {re_pattern, term(), term(), term(), term()}|undefined. - % first element must be re_pattern, but tuple may change legnth with - % versions + actual_regex()|undefined|capture_reference(). -type value_fetcher() :: {fun((pid(), leveled_codec:journal_key()) -> any()), @@ -155,6 +162,7 @@ last_moddate/0, lastmod_range/0, regular_expression/0, + actual_regex/0, value_fetcher/0, proxy_object/0, slimmed_key/0 @@ -292,8 +300,9 @@ maybe_accumulate( maybe_accumulate(T, Acc, Count, Filter, AccFun). -spec accumulate_index( - {boolean(), undefined|leveled_runner:mp()}, leveled_runner:acc_fun()) - -> any(). + {boolean()|binary(), + regular_expression()}, + leveled_runner:acc_fun()) -> any(). 
accumulate_index({false, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, _IndexInfo, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, ObjKey, Acc) @@ -302,9 +311,19 @@ accumulate_index({true, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) end; +accumulate_index( + {AddTerm, {eval, EvalFun, FilterFun}}, FoldKeysFun) -> + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + CptMap = EvalFun(IdxValue, ObjKey), + check_captured_terms( + CptMap, + FilterFun, AddTerm, FoldKeysFun, + Bucket, IdxValue, ObjKey, + Acc) + end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - case leveled_util:regex_run(IdxValue, TermRegex) of + case leveled_util:regex_run(IdxValue, TermRegex, []) of nomatch -> Acc; _ -> @@ -317,6 +336,29 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end end. +check_captured_terms( + CptMap, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> + case FilterFun(CptMap) of + true -> + case AddTerm of + true -> + FoldKeysFun(B, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(B, ObjKey, Acc); + CptKey when is_binary(CptKey) -> + case maps:get(CptKey, CptMap, undefined) of + undefined -> + Acc; + CptValue -> + FoldKeysFun(B, {CptValue, ObjKey}, Acc) + end + end; + false -> + Acc + end. + + + -spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc %% When comparing two keys in the ledger need to find if one key comes before diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl new file mode 100644 index 00000000..4f3b3b46 --- /dev/null +++ b/src/leveled_eval.erl @@ -0,0 +1,560 @@ +%% -------- Eval Functions --------- +%% +%% Support for different eval expressions within leveled +%% + +-module(leveled_eval). + +-export([apply_eval/4, generate_eval_expression/2, generate_eval_function/2]). + +%%%============================================================================ +%%% External API +%%%============================================================================ + +generate_eval_function(EvalString, Substitutions) -> + {ok, ParsedEval} = generate_eval_expression(EvalString, Substitutions), + fun(Term, Key) -> + apply_eval(ParsedEval, Term, Key, maps:new()) + end. + +generate_eval_expression(EvalString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString), + case leveled_filter:substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_evalparser:parse(UpdTokens) + end. 
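%% Illustrative sketch of the API above, mirroring the eunit tests at the
%% foot of this module (example values assumed): an eval expression is
%% compiled once, then applied per index entry to turn a term/key pair into
%% a map of named attributes.
Eval =
    leveled_eval:generate_eval_function(
        "delim($term, \"|\", ($fn, $dob))", maps:new()),
#{<<"fn">> := <<"SMITH">>, <<"dob">> := <<"19861216">>} =
    Eval(<<"SMITH|19861216">>, <<"9000000001">>).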
+ + +apply_eval({eval, Eval}, Term, Key, AttrMap) -> + apply_eval(Eval, Term, Key, AttrMap); +apply_eval({'PIPE', Eval1, 'INTO', Eval2}, Term, Key, AttrMap) -> + apply_eval(Eval2, Term, Key, apply_eval(Eval1, Term, Key, AttrMap)); +apply_eval({ + delim, {identifier, _, InKey}, {string, _, Delim}, ExpKeys}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + CptTerms = string:split(TermToSplit, Delim, all), + L = min(length(CptTerms), length(ExpKeys)), + maps:merge( + AttrMap, + maps:from_list( + lists:zip(lists:sublist(ExpKeys, L), lists:sublist(CptTerms, L))) + ); +apply_eval( + {join, InKeys, {string, _, Delim}, {identifier, _, OutKey}}, + _Term, _Key, AttrMap) -> + NewTerm = + unicode:characters_to_binary( + lists:join( + Delim, + lists:map( + fun(InKey) -> maps:get(InKey, AttrMap, <<"">>) end, + InKeys) + ) + ), + maps:put(OutKey, NewTerm, AttrMap); +apply_eval({ + split, {identifier, _, InKey}, DelimAttr, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + TermList = string:split(TermToSplit, element(3, DelimAttr), all), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {slice, {identifier, _, InKey}, WidthAttr, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + {integer, _, Width} = WidthAttr, + TermToSlice = term_to_process(InKey, Term, Key, AttrMap), + TermCount = string:length(TermToSlice) div Width, + TermList = + lists:map( + fun(S) -> string:slice(TermToSlice, S, Width) end, + lists:map(fun(I) -> Width * I end, lists:seq(0, TermCount - 1))), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {index, {identifier, _, InKey}, + {integer, _, Start}, {integer, _, Length}, + {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToIndex = term_to_process(InKey, Term, Key, AttrMap), + case string:length(TermToIndex) of + L when L >= (Start + Length) -> + maps:put( + OutKey, string:slice(TermToIndex, Start, Length), AttrMap); + _ -> + AttrMap + end; +apply_eval( + {kvsplit, + {identifier, _, InKey}, + {string, _, DelimPair}, {string, _, DelimKV}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + lists:foldl( + fun(S, AccMap) -> + case string:split(S, DelimKV, all) of + [K, V] -> + maps:put(K, V, AccMap); + _ -> + AccMap + end + end, + AttrMap, + string:split(TermToSplit, DelimPair, all) + ); +apply_eval( + {to_integer, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToConvert = term_to_process(InKey, Term, Key, AttrMap), + case string:to_integer(TermToConvert) of + {I, _Rest} when is_integer(I) -> + maps:put(OutKey, I, AttrMap); + _ -> + AttrMap + end; +apply_eval( + {to_string, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + case term_to_process(InKey, Term, Key, AttrMap) of + TermToConvert when is_integer(TermToConvert) -> + maps:put( + OutKey, + list_to_binary(integer_to_list(TermToConvert)), + AttrMap); + _ -> + AttrMap + end; +apply_eval( + {map, InID, Comparator, MapList, Default, OutID}, + Term, Key, AttrMap) -> + {identifier, _, InKey} = InID, + {identifier, _, OutKey} = OutID, + TermToCompare = term_to_process(InKey, Term, Key, AttrMap), + F = reverse_compare_mapping(element(2, Comparator), TermToCompare), + case lists:dropwhile(F, MapList) of + [] -> + maps:put(OutKey, element(3, Default), AttrMap); + [{mapping, _T, Assignment}|_Rest] -> + maps:put(OutKey, element(3, Assignment), AttrMap) + end; +apply_eval( + {MathOp, OperandX, OperandY, {identifier, _, OutKey}}, + _Term, 
_Key, AttrMap) + when MathOp == add; MathOp == subtract -> + X = maybe_fetch_operand(OperandX, AttrMap), + Y = maybe_fetch_operand(OperandY, AttrMap), + case MathOp of + add when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X + Y, AttrMap); + subtract when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X - Y, AttrMap); + _ -> + AttrMap + end; +apply_eval( + {regex, {identifier, _, InKey}, CompiledRE, ExpKeys}, + Term, Key, AttrMap) -> + TermToCapture = term_to_process(InKey, Term, Key, AttrMap), + ExpectedKeyLength = length(ExpKeys), + Opts = [{capture, all_but_first, binary}], + case leveled_util:regex_run(TermToCapture, CompiledRE, Opts) of + {match, CptTerms} -> + L = min(length(CptTerms), ExpectedKeyLength), + CptMap = + maps:from_list( + lists:zip( + lists:sublist(ExpKeys, L), + lists:sublist(CptTerms, L))), + maps:merge(AttrMap, CptMap); + _ -> + AttrMap + end + . + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +maybe_fetch_operand({identifier, _, ID}, AttrMap) -> + maps:get(ID, AttrMap, 0); +maybe_fetch_operand(Op, _AttrMap) -> + element(3, Op). + +term_to_process(<<"term">>, Term, _Key, _AttrMap) -> + Term; +term_to_process(<<"key">>, _Term, Key, _AttrMap) -> + Key; +term_to_process(AttrKey, _Term, _Key, AttrMap) -> + maps:get(AttrKey, AttrMap, <<"">>). + +reverse_compare_mapping('<', Term) -> + fun({mapping, T, _A}) -> Term >= element(3, T) end; +reverse_compare_mapping('<=', Term) -> + fun({mapping, T, _A}) -> Term > element(3, T) end; +reverse_compare_mapping('>', Term) -> + fun({mapping, T, _A}) -> Term =< element(3, T) end; +reverse_compare_mapping('>=', Term) -> + fun({mapping, T, _A}) -> Term < element(3, T) end; +reverse_compare_mapping('=', Term) -> + fun({mapping, T, _A}) -> Term =/= element(3, T) end. + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). 
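%% Illustrative sketch of the map(...) clause above: reverse_compare_mapping/2
%% drops mappings while the negated comparator holds, so the head of the
%% remainder is the first threshold the term satisfies (band labels here are
%% assumed examples, not from the patch).
GenFun =
    leveled_eval:generate_eval_function(
        "map($term, <, ((\"1980\", \"GenX\"), (\"1997\", \"GenY\")),"
        " \"GenZ\", $generation)",
        maps:new()),
%% <<"1986">> is not < <<"1980">>, but is < <<"1997">>, so GenY is assigned
#{<<"generation">> := <<"GenY">>} = GenFun(<<"1986">>, <<"Key1">>).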
+ +basic_test() -> + EvalString1 = "delim($term, \"|\", ($fn, $dob, $dod, $gns, $pcs))", + EvalString2 = "delim($gns, \"#\", ($gn1, $gn2, $gn3))", + + EvalString3 = EvalString1 ++ " | " ++ EvalString2, + {ok, Tokens3, _EndLine3} = leveled_evallexer:string(EvalString3), + {ok, ParsedExp3} = leveled_evalparser:parse(Tokens3), + EvalOut3 = + apply_eval( + ParsedExp3, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut3)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut3)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut3)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut3)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut3)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut3)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut3)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut3)), + + + EvalString4 = EvalString3 ++ " | join(($dob, $fn), \"|\", $dobfn)", + {ok, Tokens4, _EndLine4} = leveled_evallexer:string(EvalString4), + {ok, ParsedExp4} = leveled_evalparser:parse(Tokens4), + EvalOut4 = + apply_eval( + ParsedExp4, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut4)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut4)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut4)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut4)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut4)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut4)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut4)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut4)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut4)), + + + EvalString5 = EvalString4 ++ " | index($dob, 0, 4, $yob) | to_integer($yob, $yob)", + {ok, Tokens5, _EndLine5} = leveled_evallexer:string(EvalString5), + {ok, ParsedExp5} = leveled_evalparser:parse(Tokens5), + EvalOut5 = + apply_eval( + ParsedExp5, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut5)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut5)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut5)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut5)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut5)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut5)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut5)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut5)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut5)), + ?assertMatch(1986, maps:get(<<"yob">>, EvalOut5)), + + EvalString6 = EvalString1 ++ " | slice($gns, 2, $gns)", + {ok, Tokens6, _EndLine6} = leveled_evallexer:string(EvalString6), + {ok, ParsedExp6} = leveled_evalparser:parse(Tokens6), + EvalOut6 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1Ve|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut6)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut6)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut6)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut6)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut6)), + + EvalOut7 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1VeZ|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + 
?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut7)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut7)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut7)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut7)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut7)), + + EvalString8 = EvalString1 ++ " | split($gns, \"#\", $gns)", + {ok, Tokens8, _EndLine8} = leveled_evallexer:string(EvalString8), + {ok, ParsedExp8} = leveled_evalparser:parse(Tokens8), + EvalOut8 = + apply_eval( + ParsedExp8, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut8)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut8)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut8)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut8)), + ?assertMatch([<<"Willow">>, <<"Mia">>, <<"Vera">>], maps:get(<<"gns">>, EvalOut8)), + + EvalString9 = + "delim($term, \"|\", ($name, $height, $weight, $pick)) |" + " to_integer($height, $height) |" + " to_integer($weight, $weight) |" + " to_integer($pick, $pick) |" + " delim($key, \"|\", ($team, $number)) |" + " index($team, 0, 9, $doh)", + {ok, Tokens9, _EndLine9} = leveled_evallexer:string(EvalString9), + {ok, ParsedExp9} = leveled_evalparser:parse(Tokens9), + EvalOut9 = + apply_eval( + ParsedExp9, + <<"WEMBANYAMA|224cm|95kg|#1">>, + <<"SPURS|00001">>, + maps:new() + ), + ?assertMatch(<<"WEMBANYAMA">>, maps:get(<<"name">>, EvalOut9)), + ?assertMatch(224, maps:get(<<"height">>, EvalOut9)), + ?assertMatch(95, maps:get(<<"weight">>, EvalOut9)), + ?assertMatch(<<"#1">>, maps:get(<<"pick">>, EvalOut9)), + % Not changes as not starting with integer + ?assertMatch(<<"SPURS">>, maps:get(<<"team">>, EvalOut9)), + ?assertMatch(<<"00001">>, maps:get(<<"number">>, EvalOut9)), + ?assertNot(maps:is_key(<<"doh">>, EvalOut9)), + + %% Age at 30 April 2024 + EvalString10 = + EvalString5 ++ + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + " | add($age, 1, $age_next)" + " | to_string($age, $age)" + , + {ok, Tokens10, _EndLine10} = leveled_evallexer:string(EvalString10), + {ok, ParsedExp10} = leveled_evalparser:parse(Tokens10), + EvalOut10A = + apply_eval( + ParsedExp10, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"37">>, maps:get(<<"age">>, EvalOut10A)), + ?assertMatch(38, maps:get(<<"age_next">>, EvalOut10A)), + EvalOut10B = + apply_eval( + ParsedExp10, + <<"SMITH|19860216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"38">>, maps:get(<<"age">>, EvalOut10B)), + EvalString10F = + EvalString1 ++ + " | index($dob, 0, 4, $yob)" + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + % yob has not been converted to an integer, + % so the age will not be set + " | to_string($age, $age)" + , + {ok, Tokens10F, _EndLine10F} = leveled_evallexer:string(EvalString10F), + {ok, ParsedExp10F} = leveled_evalparser:parse(Tokens10F), + EvalOut10F = + apply_eval( + ParsedExp10F, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertNot(maps:is_key(<<"age">>, EvalOut10F)), + + EvalString11A = + EvalString1 ++ + " | map($dob, <, " + "((\"1946\", \"Silent\"), (\"1966\", \"Boomer\")," + "(\"1980\", \"GenX\"), (\"1997\", \"Millenial\")), 
\"GenZ\"," + " $generation)", + {ok, Tokens11A, _EndLine11A} = leveled_evallexer:string(EvalString11A), + {ok, ParsedExp11A} = leveled_evalparser:parse(Tokens11A), + EvalOut11A = + apply_eval( + ParsedExp11A, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11A)), + EvalString11B = + EvalString1 ++ + " | map($dob, <=, " + "((\"1945\", \"Silent\"), (\"1965\", \"Boomer\")," + "(\"1979\", \"GenX\"), (\"1996\", \"Millenial\")), \"GenZ\"," + " $generation)", + {ok, Tokens11B, _EndLine11B} = leveled_evallexer:string(EvalString11B), + {ok, ParsedExp11B} = leveled_evalparser:parse(Tokens11B), + EvalOut11B = + apply_eval( + ParsedExp11B, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11B)), + EvalString11C = + EvalString1 ++ + " | map($dob, >, " + "((\"1996\", \"GenZ\"), (\"1979\", \"Millenial\")," + "(\"1965\", \"GenX\"), (\"1945\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11C, _EndLine11C} = leveled_evallexer:string(EvalString11C), + {ok, ParsedExp11C} = leveled_evalparser:parse(Tokens11C), + EvalOut11C = + apply_eval( + ParsedExp11C, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11C)), + EvalString11D = + EvalString1 ++ + " | map($dob, >=, " + "((\"1997\", \"GenZ\"), (\"1980\", \"Millenial\")," + "(\"1966\", \"GenX\"), (\"1946\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11D, _EndLine11D} = leveled_evallexer:string(EvalString11D), + {ok, ParsedExp11D} = leveled_evalparser:parse(Tokens11D), + EvalOut11D = + apply_eval( + ParsedExp11D, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11D)), + + EvalString12 = + "kvsplit($term, \"|\", \"=\") | index($term, 0, 12, $ts) |" + " to_integer($ts, $ts) |" + " to_integer($DEBUG, $DEBUG) |" + " to_integer($INFO, $INFO) |" + " to_integer($WARN, $WARN) |" + " to_integer($ERROR, $ERROR) |" + " to_integer($CRITICAL, $CRITICAL) |" + " add($DEBUG, $INFO, $TOTAL) |" + " add($TOTAL, $WARN, $TOTAL) |" + " add($TOTAL, $ERROR, $TOTAL) |" + " add($TOTAL, $CRITICAL, $TOTAL)" + , + {ok, Tokens12, _EndLine12} = leveled_evallexer:string(EvalString12), + {ok, ParsedExp12} = leveled_evalparser:parse(Tokens12), + EvalOut12 = + apply_eval( + ParsedExp12, + <<"063881703147|DEBUG=804|INFO=186|WARN=10">>, + <<"ABC1233">>, + maps:new() + ), + ?assertMatch(63881703147, maps:get(<<"ts">>, EvalOut12)), + ?assertMatch(1000, maps:get(<<"TOTAL">>, EvalOut12)), + ?assertNot(maps:is_key(<<"CRITICAL">>, EvalOut12)), + + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(\"1972\", \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, Tokens13, _EndLine13} = leveled_evallexer:string(EvalString13), + {ok, ParsedExp13} = leveled_evalparser:parse(Tokens13), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + EvalOut13B = + apply_eval(ParsedExp13, <<"cup_year:1970">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"bad">>, maps:get(<<"cup_happy">>, EvalOut13B)), + EvalOut13C = + apply_eval(ParsedExp13, <<"cup_year:2024">>, 
<<"ABC1">>, maps:new()), + ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C)), + + ExtractRegex = + "(?P[^\\|]*)\\|(?P[0-9]{8})\\|(?P[0-9]{0,8})\\|" + "(?P[^\\|]*)\\|(?P[^\\|]*)|.", + ok = + check_regex_eval( + "regex($term, :regex, re2, ($fn, $dob, $dod, $gns, $pcs))", + ExtractRegex + ), + ok = + check_regex_eval( + "regex($term, :regex, pcre, ($fn, $dob, $dod, $gns, $pcs))", + ExtractRegex + ), + ok = + check_regex_eval( + "regex($term, :regex, ($fn, $dob, $dod, $gns, $pcs))", + ExtractRegex + ) + + . + + +check_regex_eval(EvalString14, ExtractRegex) -> + {ok, ParsedExp14} = + generate_eval_expression( + EvalString14, + #{<<"regex">> => list_to_binary(ExtractRegex)} + ), + EvalOut14 = + apply_eval( + ParsedExp14, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut14)), + ok. + + +generate_test() -> + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(:clarke, \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, ParsedExp13} = + generate_eval_expression(EvalString13, #{<<"clarke">> => <<"1972">>}), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + ?assertMatch( + {error, "Substitution <<\"clarke\">> not found"}, + generate_eval_expression(EvalString13, maps:new()) + ). + +-endif. \ No newline at end of file diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl new file mode 100644 index 00000000..5944445c --- /dev/null +++ b/src/leveled_evallexer.xrl @@ -0,0 +1,53 @@ +%% Lexer for eval expressions + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +, : {token, {',', TokenLine}}. +\| : {token, {'PIPE', TokenLine}}. + +delim : {token, {delim, TokenLine}}. +join : {token, {join, TokenLine}}. +split : {token, {split, TokenLine}}. +slice : {token, {slice, TokenLine}}. +index : {token, {index, TokenLine}}. +kvsplit : {token, {kvsplit, TokenLine}}. +regex : {token, {regex, TokenLine}}. +to_integer : {token, {to_integer, TokenLine}}. +to_string : {token, {to_string, TokenLine}}. +add : {token, {add, TokenLine}}. +subtract : {token, {subtract, TokenLine}}. +map : {token, {map, TokenLine}}. +pcre : {token, {pcre, TokenLine}}. +re2 : {token, {re2, TokenLine}}. + += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). 
\ No newline at end of file diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl new file mode 100644 index 00000000..58384aa4 --- /dev/null +++ b/src/leveled_evalparser.yrl @@ -0,0 +1,75 @@ +%% Grammar for eval expressions + +Nonterminals +top_level eval +operand math_operand regex_method +mapping mappings mappings_list +identifiers identifier_list. + +Terminals +'(' ')' ',' +identifier string integer comparator +'PIPE' +delim join split slice index kvsplit regex map +add subtract +to_integer to_string +pcre re2. + +Rootsymbol top_level. + +top_level -> eval: {eval, '$1'}. + +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. +eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. +eval -> regex '(' identifier ',' string ',' regex_method ',' identifier_list ')' : {regex, '$3', re_compile('$5', '$7'), '$9'}. +eval -> regex '(' identifier ',' string ',' identifier_list ')' : {regex, '$3', re_compile('$5'), '$7'}. +eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. +eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. +eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. +eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. +eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. + +mappings_list -> '(' mappings ')' : '$2'. + +mappings -> mapping ',' mappings : ['$1' | '$3']. +mappings -> mapping : ['$1']. + +mapping -> '(' operand ',' operand ')' : {mapping, '$2', '$4'}. + +operand -> string : '$1'. +operand -> integer : '$1'. + +math_operand -> integer : '$1'. +math_operand -> identifier : '$1'. + +regex_method -> pcre : '$1'. +regex_method -> re2 : '$1'. + +identifier_list -> '(' identifiers ')' : strip_ids('$2'). + +identifiers -> identifier ',' identifiers : ['$1' | '$3']. +identifiers -> identifier : ['$1']. + +Endsymbol '$end'. + +Right 100 'PIPE'. + +Erlang code. + +strip_ids(IDL) -> + lists:map( + fun(ID) -> element(3, ID) end, + lists:flatten(IDL) + ). + +re_compile(RegexStr) -> + re_compile(RegexStr, {re2, element(2, RegexStr)}). + +re_compile({string, _LN, Regex}, Method) -> + {ok, CRE} = leveled_util:regex_compile(Regex, element(1, Method)), + CRE. \ No newline at end of file diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl new file mode 100644 index 00000000..7761d4f4 --- /dev/null +++ b/src/leveled_filter.erl @@ -0,0 +1,370 @@ +%% -------- Filter Functions --------- +%% +%% Support for different filter expressions within leveled +%% + +-module(leveled_filter). + +-export( + [ + generate_filter_expression/2, + apply_filter/2, + substitute_items/3 + ]). 
+ +%%%============================================================================ +%%% External API +%%%============================================================================ + + +apply_filter({condition, Condition}, AttrMap) -> + apply_filter(Condition, AttrMap); +apply_filter({'OR', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) orelse apply_filter(P2, AttrMap); +apply_filter({'AND', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) andalso apply_filter(P2, AttrMap); +apply_filter({'NOT', P1}, AttrMap) -> + not apply_filter(P1, AttrMap); +apply_filter({'BETWEEN', {identifier, _, ID}, CompA, CompB}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(3, CompA), element(3, CompB)} of + {Low, High} when Low =< High -> + V >= Low andalso V =< High; + {High, Low} -> + V >= Low andalso V =< High + end + end; +apply_filter({'IN', {string, _, TestString}, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + CheckList when is_list(CheckList) -> + lists:member(TestString, CheckList); + _ -> + false + end; +apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) when is_list(CheckList) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + lists:member(V, lists:map(fun(C) -> element(3, C) end, CheckList)) + end; +apply_filter({{comparator, Cmp, _}, {identifier, _ , ID}, CmpA}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(1, CmpA), V} of + {string, V} when is_binary(V) -> + compare(Cmp, V, element(3, CmpA)); + {integer, V} when is_integer(V) -> + compare(Cmp, V, element(3, CmpA)); + _ -> + false + end + end; +apply_filter({contains, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:find(V, SubStr) of + nomatch -> + false; + _ -> + true + end; + _ -> + false + end; +apply_filter({begins_with, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:prefix(V, SubStr) of + nomatch -> + false; + _ -> + true + end; + _ -> + false + end; +apply_filter({attribute_exists, {identifier, _, ID}}, AttrMap) -> + maps:is_key(ID, AttrMap); +apply_filter({attribute_not_exists, {identifier, _, ID}}, AttrMap) -> + not maps:is_key(ID, AttrMap); +apply_filter({attribute_empty, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + <<>> -> + true; + _ -> + false + end +. + +generate_filter_expression(FilterString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_filterlexer:string(FilterString), + case substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_filterparser:parse(UpdTokens) + end. + +substitute_items([], _Subs, UpdTokens) -> + lists:reverse(UpdTokens); +substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> + case maps:get(ID, Subs, notfound) of + notfound -> + {error, + lists:flatten( + io_lib:format("Substitution ~p not found", [ID]))}; + Value when is_binary(Value) -> + substitute_items( + Rest, Subs, [{string, LN, Value}|UpdTokens]); + Value when is_integer(Value) -> + substitute_items(Rest, Subs, [{integer, LN, Value}|UpdTokens]); + _UnexpectedValue -> + {error, + lists:flatten( + io_lib:format("Substitution ~p unexpected type", [ID]))} + end; +substitute_items([Token|Rest], Subs, UpdTokens) -> + substitute_items(Rest, Subs, [Token|UpdTokens]). 
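%% Illustrative sketch of generate_filter_expression/2 and apply_filter/2
%% above, based on filterexpression_test/0 below (values assumed).
%% Substitutions such as :fn are spliced into the token stream before
%% parsing; the parsed expression is then applied to an attribute map.
{ok, Filter} =
    leveled_filter:generate_filter_expression(
        "($dob BETWEEN \"19740301\" AND \"19761030\")"
        " AND begins_with($fn, :fn)",
        #{<<"fn">> => <<"SM">>}),
true =
    leveled_filter:apply_filter(
        Filter, #{<<"dob">> => <<"19750202">>, <<"fn">> => <<"SMITH">>}).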
+ +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +compare('>', V, CmpA) -> V > CmpA; +compare('>=', V, CmpA) -> V >= CmpA; +compare('<', V, CmpA) -> V < CmpA; +compare('<=', V, CmpA) -> V =< CmpA; +compare('=', V, CmpA) -> V == CmpA; +compare('<>', V, CmpA) -> V =/= CmpA. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +invalid_filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") OR (($b >= \"30\") AND contains($c, :d))", + SubsMissing = maps:from_list([{<<"a">>, <<"MA">>}]), + ?assertMatch( + {error, "Substitution <<\"d\">> not found"}, + generate_filter_expression(FE1, SubsMissing) + ), + SubsWrongType = maps:from_list([{<<"d">>, "42"}]), + ?assertMatch( + {error, "Substitution <<\"d\">> unexpected type"}, + generate_filter_expression(FE1, SubsWrongType) + ), + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + FE2 = "($a BETWEEN \"A\" AND 12) OR (($b >= \"30\") AND contains($c, :d))", + ?assertMatch( + {error, {1, leveled_filterparser,["syntax error before: ","12"]}}, + generate_filter_expression(FE2, SubsPresent) + ), + SubsWrongTypeForContains = maps:from_list([{<<"d">>, 42}]), + FE4 = "($a BETWEEN 12 AND 12) OR (($b >= \"30\") AND contains($c, :d))", + ?assertMatch( + {error, {1, leveled_filterparser, ["syntax error before: ","42"]}}, + generate_filter_expression(FE4, SubsWrongTypeForContains) + ). + +filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) AND contains($c, :d))", + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + {ok, Filter1} = generate_filter_expression(FE1, SubsPresent), + M1 = #{<<"a">> => <<"A11">>, <<"b">> => 100, <<"c">> => <<"CARTMAN">>}, + ?assert(apply_filter(Filter1, M1)), + % ok + + M2 = #{<<"a">> => <<"A11">>, <<"b">> => 10, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M2)), + % $b < 30 + + FE2 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter2} = generate_filter_expression(FE2, SubsPresent), + ?assert(apply_filter(Filter2, M2)), + % OR used so ($b >= 30) = false is ok + + FE3 = "($a BETWEEN \"A12\" AND \"A\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter3} = generate_filter_expression(FE3, SubsPresent), + ?assert(apply_filter(Filter3, M2)), + % swapping the low/high - still ok + + M3 = #{<<"a">> => <<"A11">>, <<"b">> => <<"100">>, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M3)), + % substitution b is not an integer + + FE4 = + "($dob BETWEEN \"19700101\" AND \"19791231\") " + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, Filter4} = generate_filter_expression(FE4, maps:new()), + M4 = + #{ + <<"dob">> => <<"19751124">>, + <<"gns">> => <<"#Mia#Willow#Chloe">>, + <<"pcs">> => <<"#BD1 1DU#LS1 4BT">> + }, + ?assert(apply_filter(Filter4, M4)), + + FE5 = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " OR ($dod > \"20200101\" AND $dod < \"20230101\")", + + {ok, Filter5} = generate_filter_expression(FE5, maps:new()), + F = fun(M) -> apply_filter(Filter5, M) end, + + M5 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20221216">>}]), + M6 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20191216">>}]), + M7 = maps:from_list([{<<"dob">>, 
<<"19790202">>}, {<<"dod">>, <<"20221216">>}]), + M8 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20191216">>}]), + M9 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20241216">>}]), + M10 = maps:new(), + ?assertMatch(true, F(M5)), + ?assertMatch(true, F(M6)), + ?assertMatch(true, F(M7)), + ?assertMatch(false, F(M8)), + ?assertMatch(false, F(M9)), + ?assertMatch(false, F(M10)), + + FE5A = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " AND ($dod = \"20221216\")", + {ok, Filter5A} = generate_filter_expression(FE5A, maps:new()), + ?assert(apply_filter(Filter5A, M5)), + ?assertNot(apply_filter(Filter5A, M6)), + FE5B = + "$dob >= \"19740301\" AND $dob <= \"19761030\"" + " AND $dod = \"20221216\"", + {ok, Filter5B} = generate_filter_expression(FE5B, maps:new()), + ?assert(apply_filter(Filter5B, M5)), + ?assertNot(apply_filter(Filter5B, M6)), + + FE6 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob <> \"19993112\"", + {ok, Filter6} = generate_filter_expression(FE6, maps:new()), + M11 = maps:from_list([{<<"dob">>, <<"19993112">>}]), + ?assertMatch(false, apply_filter(Filter6, M11)), + + FE7 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob = \"19993112\"", + {ok, Filter7} = generate_filter_expression(FE7, maps:new()), + ?assert(apply_filter(Filter7, M11)), + + FE8 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", \"19910103\")", + {ok, Filter8} = generate_filter_expression(FE8, maps:new()), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910103">>})), + ?assertNot(apply_filter(Filter8, #{<<"dob">> => <<"19910102">>})), + ?assertNot(apply_filter(Filter8, #{<<"gn">> => <<"Nikki">>})), + + FE9 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", 19910103)", + % Only match with a type match + {ok, Filter9} = generate_filter_expression(FE9, maps:new()), + ?assert(apply_filter(Filter9, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter9, #{<<"dob">> => 19910103})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => 19910301})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => <<"19910103">>})), + + FE10 = "NOT contains($gn, \"MA\") AND " + "(NOT $dob IN (\"19910301\", \"19910103\"))", + {ok, Filter10} = generate_filter_expression(FE10, maps:new()), + ?assert( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE11 = "NOT contains($gn, \"MA\") AND " + "NOT $dob IN (\"19910301\", \"19910103\")", + {ok, Filter11} = generate_filter_expression(FE11, maps:new()), + ?assert( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE12 = "begins_with($gn, \"MA\") AND begins_with($fn, :fn)", + {ok, Filter12} = generate_filter_expression(FE12, #{<<"fn">> => <<"SU">>}), + ?assert( + apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => <<"MITTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + 
apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SIMMS">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => 42, <<"fn">> => <<"SUMMER">>})), + + FE13 = "attribute_exists($dob) AND attribute_not_exists($consent) " + "AND attribute_empty($dod)", + {ok, Filter13} = generate_filter_expression(FE13, maps:new()), + ?assert( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, + <<"consent">> => <<>>, + <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})), + + FE14 = "\"M1\" IN $gns", + {ok, Filter14} = generate_filter_expression(FE14, maps:new()), + ?assert( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M1">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M2">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => <<"M1">>})) + . + +-endif. diff --git a/src/leveled_filterlexer.xrl b/src/leveled_filterlexer.xrl new file mode 100644 index 00000000..6d779f41 --- /dev/null +++ b/src/leveled_filterlexer.xrl @@ -0,0 +1,49 @@ +%% Lexer for filter and conditional expressions +%% Author: Thomas Arts + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. + +, : {token, {',', TokenLine}}. +NOT : {token, {'NOT', TokenLine}}. +AND : {token, {'AND', TokenLine}}. +OR : {token, {'OR', TokenLine}}. +BETWEEN : {token, {'BETWEEN', TokenLine}}. +IN : {token, {'IN', TokenLine}}. += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. + +contains : {token, {contains, TokenLine}}. +begins_with : {token, {begins_with, TokenLine}}. +attribute_exists : {token, {attribute_exists, TokenLine}}. +attribute_not_exists : {token, {attribute_not_exists, TokenLine}}. +attribute_empty : {token, {attribute_empty, TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_filterparser.yrl b/src/leveled_filterparser.yrl new file mode 100644 index 00000000..ca060eb9 --- /dev/null +++ b/src/leveled_filterparser.yrl @@ -0,0 +1,51 @@ +%% Grammar for filter expressions +%% Author: Thomas Arts + +Nonterminals +top_level condition operand operand_list operands. + + +Terminals +'(' ')' comparator identifier string integer +',' +'NOT' 'AND' 'OR' 'IN' 'BETWEEN' +contains begins_with +attribute_exists attribute_not_exists attribute_empty. + + +Rootsymbol top_level. + +top_level -> condition: {condition, '$1'}. 
+ +condition -> operand comparator operand : {'$2', '$1', '$3'}. +condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'BETWEEN' integer 'AND' integer : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'IN' operand_list : {'IN', '$1', '$3'}. +condition -> operand 'IN' identifier : {'IN', '$1', '$3'}. +condition -> condition 'AND' condition : {'AND', '$1', '$3'}. +condition -> condition 'OR' condition : {'OR', '$1', '$3'}. +condition -> 'NOT' condition : {'NOT', '$2'}. +condition -> contains '(' identifier ',' string ')' : {contains, '$3', '$5'}. +condition -> begins_with '(' identifier ',' string ')' : {begins_with, '$3', '$5'}. +condition -> attribute_exists '(' identifier ')' : {attribute_exists, '$3'}. +condition -> attribute_not_exists '(' identifier ')' : {attribute_not_exists, '$3'}. +condition -> attribute_empty '(' identifier ')' : {attribute_empty, '$3'}. +condition -> '(' condition ')' : '$2'. + +operand -> identifier : '$1'. +operand -> string : '$1'. +operand -> integer : '$1'. + +operand_list -> '(' operands ')' : '$2'. + +operands -> operand ',' operands : ['$1' | '$3']. +operands -> operand : ['$1']. + +Endsymbol '$end'. + +Right 200 'NOT'. +Nonassoc 200 comparator. +Left 150 'AND'. +Left 100 'OR'. + +Erlang code. diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 4b221b09..15704b20 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -130,12 +130,11 @@ bucket_list(SnapFun, Tag, FoldBucketsFun, InitAcc, MaxBuckets) -> end, {async, Runner}. --spec index_query(snap_fun(), - {leveled_codec:ledger_key(), - leveled_codec:ledger_key(), - {boolean(), undefined|mp()|iodata()}}, - {fold_keys_fun(), foldacc()}) - -> {async, runner_fun()}. +-spec index_query( + snap_fun(), + {leveled_codec:ledger_key(), leveled_codec:ledger_key(), + {boolean()|binary(), leveled_codec:regular_expression()}}, + {fold_keys_fun(), foldacc()}) -> {async, runner_fun()}. %% @doc %% Secondary index query %% This has the special capability that it will expect a message to be thrown @@ -681,7 +680,7 @@ accumulate_keys(FoldKeysFun, undefined) -> accumulate_keys(FoldKeysFun, TermRegex) -> fun(Key, _Value, Acc) -> {B, K} = leveled_codec:from_ledgerkey(Key), - case leveled_util:regex_run(K, TermRegex) of + case leveled_util:regex_run(K, TermRegex, []) of nomatch -> Acc; _ -> diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 0817b32b..fdf32ae5 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -11,8 +11,9 @@ magic_hash/1, t2b/1, safe_rename/4, - regex_run/2, - regex_compile/1 + regex_run/3, + regex_compile/1, + regex_compile/2 ]). -define(WRITE_OPS, [binary, raw, read, write]). @@ -42,17 +43,38 @@ integer_time(TS) -> DT = calendar:now_to_universal_time(TS), calendar:datetime_to_gregorian_seconds(DT). + +-type match_option() :: + 'caseless' | + {'offset', non_neg_integer()} | + {'capture', value_spec()} | + {'capture', value_spec(), value_spec_type()}. +-type value_spec() :: + 'all' | 'all_but_first' | 'first' | 'none' | [value_id()]. +-type value_spec_type() :: 'binary'. +-type value_id() :: binary(). +-type match_index() :: {non_neg_integer(), non_neg_integer()}. + -spec regex_run( - iodata(), any()) -> - match | nomatch | {match, list()} | {error, atom()}. -regex_run(Subject, CompiledRE2) when is_reference(CompiledRE2) -> - re2:run(Subject, CompiledRE2); -regex_run(Subject, CompiledPCRE) -> - re:run(Subject, CompiledPCRE). - --spec regex_compile(iodata()) -> reference(). 
+ iodata(), leveled_codec:actual_regex(), list(match_option())) -> + match | + nomatch | + {match, list(match_index())} | + {match, list(binary())} | + {error, atom()}. +regex_run(Subject, CompiledRE2, Opts) when is_reference(CompiledRE2) -> + re2:run(Subject, CompiledRE2, Opts); +regex_run(Subject, CompiledPCRE, Opts) -> + re:run(Subject, CompiledPCRE, Opts). + +-spec regex_compile(iodata()) -> {ok, reference()}. regex_compile(PlainRegex) -> - re2:compile(PlainRegex). + regex_compile(PlainRegex, re2). + +regex_compile(PlainRegex, re2) -> + re2:compile(PlainRegex); +regex_compile(PlainRegex, pcre) -> + re:compile(PlainRegex). -spec magic_hash(any()) -> 0..16#FFFFFFFF. %% @doc diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index a72f36e9..74e71f79 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -1,6 +1,5 @@ -module(iterator_SUITE). --include_lib("common_test/include/ct.hrl"). -include("include/leveled.hrl"). -define(KEY_ONLY, {false, undefined}). @@ -13,7 +12,9 @@ query_count/1, multibucket_fold/1, foldobjects_bybucket_range/1, - rotating_objects/1]). + rotating_objects/1, + capture_and_filter_terms/1 + ]). all() -> [ expiring_indexes, @@ -23,10 +24,10 @@ all() -> [ query_count, multibucket_fold, rotating_objects, - foldobjects_bybucket_range + foldobjects_bybucket_range, + capture_and_filter_terms ]. - expiring_indexes(_Config) -> % Add objects to the store with index entries, where the objects (and hence % the indexes have an expiry time. Confirm that the indexes and the @@ -48,11 +49,11 @@ expiring_indexes(_Config) -> SW1 = os:timestamp(), timer:sleep(1000), - V9 = testutil:get_compressiblevalue(), + V1 = <<"V1">>, Indexes9 = testutil:get_randomindexes_generator(2), TempRiakObjects = testutil:generate_objects( - KeyCount, binary_uuid, [], V9, Indexes9, "riakBucket"), + KeyCount, binary_uuid, [], V1, Indexes9, "riakBucket"), IBKL1 = testutil:stdload_expiring(Bookie1, KeyCount, Future), lists:foreach( @@ -142,11 +143,6 @@ expiring_indexes(_Config) -> Bookie1, B0, K0, 5, <<"value">>, leveled_util:integer_now() + 10), timer:sleep(1000), {async, Folder2} = IndexFold(), - leveled_bookie:book_indexfold(Bookie1, - B0, - {FoldFun, InitAcc}, - {<<"temp_int">>, 5, 8}, - {true, undefined}), QR2 = Folder2(), io:format("Query with additional entry length ~w~n", [length(QR2)]), true = lists:sort(QR2) == lists:sort([{5, B0, K0}|LoadedEntriesInRange]), @@ -201,13 +197,11 @@ breaking_folds(_Config) -> {max_journalsize, 10000000}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + V1 = testutil:get_compressiblevalue_andinteger(), IndexGen = testutil:get_randomindexes_generator(8), - ObjL1 = testutil:generate_objects(KeyCount, - binary_uuid, - [], - ObjectGen, - IndexGen), + ObjL1 = + testutil:generate_objects( + KeyCount, binary_uuid, [], V1, IndexGen), testutil:riakload(Bookie1, ObjL1), % Find all keys index, and then same again but stop at a midpoint using a @@ -282,10 +276,11 @@ breaking_folds(_Config) -> end end, {async, HeadFolderToMidK} = - leveled_bookie:book_headfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(HeadFoldFun), []}, - true, true, false), + leveled_bookie:book_headfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(HeadFoldFun), []}, + true, true, false), KeySizeList2 = lists:reverse(CatchingFold(HeadFolderToMidK)), io:format("Head fold with result size ~w~n", [length(KeySizeList2)]), true = KeyCount div 2 == 
length(KeySizeList2), @@ -295,21 +290,19 @@ breaking_folds(_Config) -> [{K,byte_size(V)}|Acc] end, {async, ObjectFolderKO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, key_order), ObjSizeList1 = lists:reverse(ObjectFolderKO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1)]), true = KeyCount == length(ObjSizeList1), {async, ObjFolderToMidK} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(ObjFoldFun), []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(ObjFoldFun), []}, + false, + key_order), ObjSizeList2 = lists:reverse(CatchingFold(ObjFolderToMidK)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2)]), true = KeyCount div 2 == length(ObjSizeList2), @@ -319,11 +312,8 @@ breaking_folds(_Config) -> % that was terminated by reaching a point in the key range .. as results % will not be passed to the fold function in key order {async, ObjectFolderSO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, sqn_order), ObjSizeList1_SO = lists:reverse(ObjectFolderSO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1_SO)]), true = KeyCount == length(ObjSizeList1_SO), @@ -341,33 +331,26 @@ breaking_folds(_Config) -> end end, {async, ObjFolderTo1K} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowThousandFun(ObjFoldFun), []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowThousandFun(ObjFoldFun), []}, + false, + sqn_order), ObjSizeList2_SO = lists:reverse(CatchingFold(ObjFolderTo1K)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2_SO)]), true = 1000 == length(ObjSizeList2_SO), - ObjL2 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B2"), - ObjL3 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B3"), - ObjL4 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B4"), + ObjectGen = testutil:get_compressiblevalue_andinteger(), + ObjL2 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B2"), + ObjL3 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B3"), + ObjL4 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B4"), testutil:riakload(Bookie1, ObjL2), testutil:riakload(Bookie1, ObjL3), testutil:riakload(Bookie1, ObjL4), @@ -391,15 +374,12 @@ breaking_folds(_Config) -> end, {async, StopAt3BucketFolder} = - leveled_bookie:book_bucketlist(Bookie1, - ?RIAK_TAG, - {StopAt3Fun, []}, - all), + leveled_bookie:book_bucketlist( + Bookie1, ?RIAK_TAG, {StopAt3Fun, []}, all), BucketListSA3 = lists:reverse(CatchingFold(StopAt3BucketFolder)), io:format("bucket list with result ~w~n", [BucketListSA3]), true = [<<"B2">>, <<"B3">>] == BucketListSA3, - ok = leveled_bookie:book_close(Bookie1), testutil:reset_filestructure(). 
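%% Illustrative sketch of the reworked leveled_util regex API (see the
%% leveled_util.erl hunk above): the compile backend is now explicit, and
%% regex_run/3 takes re-style options. Passing capture options through to
%% the re2 NIF is assumed to behave as the new match_option() spec suggests.
{ok, RE2} = leveled_util:regex_compile("[0-9]{8}", re2),
{match, [<<"19861216">>]} =
    leveled_util:regex_run(
        <<"SMITH|19861216|">>, RE2, [{capture, first, binary}]),
{ok, PCRE} = leveled_util:regex_compile("[0-9]{8}", pcre),
nomatch = leveled_util:regex_run(<<"SMITH||">>, PCRE, []).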
@@ -501,32 +481,29 @@ small_load_with2i(_Config) -> true = 1 == length(KeyList2), %% Delete the objects from the ChkList removing the indexes - lists:foreach(fun({_RN, Obj, Spc}) -> - DSpc = lists:map(fun({add, F, T}) -> - {remove, F, T} - end, - Spc), - {B, K} = - {testutil:get_bucket(Obj), testutil:get_key(Obj)}, - testutil:book_riakdelete(Bookie1, B, K, DSpc) - end, - ChkList1), + lists:foreach( + fun({_RN, Obj, Spc}) -> + DSpc = lists:map(fun({add, F, T}) -> + {remove, F, T} + end, + Spc), + {B, K} = + {testutil:get_bucket(Obj), testutil:get_key(Obj)}, + testutil:book_riakdelete(Bookie1, B, K, DSpc) + end, + ChkList1), %% Get the Buckets Keys and Hashes for the whole bucket - FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] - end, + FoldObjectsFun = + fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] end, - {async, HTreeF1} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldObjectsFun, []}, - false), + {async, HTreeF1} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {FoldObjectsFun, []}, false), KeyHashList1 = HTreeF1(), - {async, HTreeF2} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - "Bucket", - all, - {FoldObjectsFun, []}, - false), + {async, HTreeF2} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, "Bucket", all, {FoldObjectsFun, []}, false), KeyHashList2 = HTreeF2(), {async, HTreeF3} = leveled_bookie:book_objectfold( @@ -541,10 +518,11 @@ small_load_with2i(_Config) -> true = 9900 == length(KeyHashList2), true = 9900 == length(KeyHashList3), - SumIntFun = fun(_B, _K, Obj, Acc) -> - {I, _Bin} = testutil:get_value(Obj), - Acc + I - end, + SumIntFun = + fun(_B, _K, Obj, Acc) -> + {I, _Bin} = testutil:get_value(Obj), + Acc + I + end, BucketObjQ = {foldobjects_bybucket, ?RIAK_TAG, "Bucket", all, {SumIntFun, 0}, true}, {async, Sum1} = leveled_bookie:book_returnfolder(Bookie1, BucketObjQ), @@ -559,8 +537,9 @@ small_load_with2i(_Config) -> lists:foldl(SumFromObjLFun, 0, ObjL1), ChkList1Total = lists:foldl(SumFromObjLFun, 0, ChkList1), - io:format("Total in original object list ~w and from removed list ~w~n", - [ObjL1Total, ChkList1Total]), + io:format( + "Total in original object list ~w and from removed list ~w~n", + [ObjL1Total, ChkList1Total]), Total1 = ObjL1Total - ChkList1Total, @@ -598,7 +577,7 @@ query_count(_Config) -> testutil:check_forobject(Book1, TestObject), lists:foreach( fun(_X) -> - V = testutil:get_compressiblevalue(), + V = <<"TestValue">>, Indexes = testutil:get_randomindexes_generator(8), SW = os:timestamp(), ObjL1 = @@ -677,7 +656,7 @@ query_count(_Config) -> Mia2000Count2 = lists:foldl( fun({Term, _Key}, Acc) -> - case leveled_util:regex_run(Term, RegMia) of + case leveled_util:regex_run(Term, RegMia, []) of nomatch -> Acc; _ -> @@ -824,10 +803,9 @@ query_count(_Config) -> ok = leveled_bookie:book_close(Book4), - {ok, Book5} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Book5} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), {async, BLF3} = leveled_bookie:book_returnfolder(Book5, BucketListQuery), SW_QC = os:timestamp(), BucketSet3 = BLF3(), @@ -840,6 +818,315 @@ query_count(_Config) -> testutil:reset_filestructure(). 
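%% Illustrative sketch: the capture_and_filter_terms/1 case added below
%% drives {eval, EvalFun, FilterFun} queries through book_returnfolder/2.
%% The widened book_indexfold/5 spec supports the same TermHandling, where a
%% binary AddTerm names the captured attribute to return in place of the raw
%% index term. Bookie, bucket and index names here are assumed examples.
EvalFun =
    leveled_eval:generate_eval_function(
        "index($term, 0, 8, $dob)", maps:new()),
{async, Runner} =
    leveled_bookie:book_indexfold(
        Bookie,
        <<"people">>,
        {fun(_B, {DoB, K}, Acc) -> [{DoB, K}|Acc] end, []},
        {<<"people_bin">>, <<"0">>, <<"9">>},
        {<<"dob">>, {eval, EvalFun, fun(_) -> true end}}),
DoBsAndKeys = Runner().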
+capture_and_filter_terms(_Config) -> + RootPath = testutil:reset_filestructure(), + Bucket = {<<"Type1">>, <<"Bucket1">>}, + IdxName = <<"people_bin">>, + {ok, Book1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), + V1 = <<"V1">>, + IndexGen = + fun() -> + [{add, IdxName, list_to_binary(perf_SUITE:random_people_index())}] + end, + ObjL1 = + testutil:generate_objects( + 100000, uuid, [], V1, IndexGen, Bucket), + testutil:riakload(Book1, ObjL1), + + StartDoB = <<"19740301">>, + EndDoB = <<"19761031">>, + + WillowLeedsFinder = + "[^\\|]*\\|[0-9]{8}\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + "[^\\|]*#LS[^\\|]*", + + SW0 = os:timestamp(), + {ok, WillowLeedsPCRE} = re:compile(WillowLeedsFinder), + + QueryPCRE0 = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {true, WillowLeedsPCRE}}, + {async, Runner0} = leveled_bookie:book_returnfolder(Book1, QueryPCRE0), + Results0 = Runner0(), + BornMid70s0 = + lists:filtermap( + fun({IdxValue, Key}) -> + DoB = + list_to_binary( + lists:nth( + 2, + string:tokens(binary_to_list(IdxValue), "|") + ) + ), + case (DoB >= StartDoB) andalso (DoB =< EndDoB) of + true -> + {true, Key}; + false -> + false + end + end, + Results0 + ), + + SW1 = os:timestamp(), + + WillowLeedsExtractor = + "[^\\|]*\\|(?P[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + "[^\\|]*#LS[^\\|]*", + FilterFun1 = + fun(Captures) -> + DoB = maps:get(<<"dob">>, Captures, notfound), + (DoB >= StartDoB) andalso (DoB =< EndDoB) + end, + EvalFunPCRE = + leveled_eval:generate_eval_function( + "regex($term, :regex, pcre, ($dob))", + #{<<"regex">> => list_to_binary(WillowLeedsExtractor)} + ), + + QueryPCRE1 = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {false, {eval, EvalFunPCRE, FilterFun1}} + }, + {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1), + BornMid70sPCRE1 = RunnerPCRE1(), + + SW2 = os:timestamp(), + + EvalFunRE2 = + leveled_eval:generate_eval_function( + "regex($term, :regex, re2, ($dob))", + #{<<"regex">> => list_to_binary(WillowLeedsExtractor)} + ), + QueryRE2_2 = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {false, {eval, EvalFunRE2, FilterFun1}} + }, + {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2), + BornMid70sRE2_2 = RunnerRE2_2(), + + SW3 = os:timestamp(), + + AllFun = fun(_) -> true end, + QueryRE2_3 = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {<<"dob">>, {eval, EvalFunRE2, AllFun}} + }, + {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3), + Results3 = RunnerRE2_3(), + BornMid70sRE2_3 = + lists:filtermap( + fun({DoB, Key}) -> + case (DoB >= StartDoB) andalso (DoB =< EndDoB) of + true -> + {true, Key}; + false -> + false + end + end, + Results3 + ), + + SW4 = os:timestamp(), + + WillowLeedsDoubleExtractor = + "[^\\|]*\\|(?P[0-9]{8})\\|(?P[0-9]{0,8})\\|" + "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*", + EvalFunRE2_2 = + leveled_eval:generate_eval_function( + "regex($term, :regex, re2, ($dob, $dod))", + #{<<"regex">> => list_to_binary(WillowLeedsDoubleExtractor)} + ), + + FilterFun2 = + fun(Captures) -> + DoB = maps:get(<<"dob">>, Captures, notfound), + (DoB >= StartDoB) andalso (DoB =< EndDoB) + end, + QueryRE2_4 = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {false, {eval, 
+        },
+    {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
+    BornMid70sRE2_4 = RunnerRE2_4(),
+
+    SW5 = os:timestamp(),
+
+    QueryRE2_5 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {true, {eval, EvalFunRE2, FilterFun1}}
+        },
+    {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
+    {ok, WillowLeedsExtractorRE2} = re2:compile(WillowLeedsExtractor),
+    BornMid70sRE2_5 =
+        lists:filtermap(
+            fun({T, K}) ->
+                {match, _} =
+                    leveled_util:regex_run(T, WillowLeedsExtractorRE2, []),
+                {true, K}
+            end,
+            RunnerRE2_5()),
+
+    SW8 = os:timestamp(),
+
+    FilterExpression1 = "($dob BETWEEN \"19740301\" AND \"19761030\")",
+    {ok, FilterExpressionParsed1} =
+        leveled_filter:generate_filter_expression(
+            FilterExpression1, maps:new()),
+    FilterFun5 =
+        fun(AttrMap) ->
+            leveled_filter:apply_filter(FilterExpressionParsed1, AttrMap)
+        end,
+
+    QueryRE2_8 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false, {eval, EvalFunRE2, FilterFun5}}
+        },
+    {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8),
+    BornMid70sRE2_8 = RunnerRE2_8(),
+
+    SW9 = os:timestamp(),
+
+    PreFilterRE =
+        "[^\\|]*\\|(?P<dob>197[4-6]{1}[0-9]{4})\\|"
+        "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+    PreFilterEvalFun =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob))",
+            #{<<"regex">> => list_to_binary(PreFilterRE)}
+        ),
+
+    QueryRE2_9 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false, {eval, PreFilterEvalFun, FilterFun5}}
+        },
+    {async, RunnerRE2_9} = leveled_bookie:book_returnfolder(Book1, QueryRE2_9),
+    BornMid70sRE2_9 = RunnerRE2_9(),
+
+    SW10 = os:timestamp(),
+
+    WillowLeedsExtractor =
+        "[^\\|]*\\|(?P<dob>[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+
+    FilterExpression2 =
+        "($dob BETWEEN \"19740301\" AND \"19761030\") "
+        "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))",
+    {ok, FilterExpressionParsed2} =
+        leveled_filter:generate_filter_expression(
+            FilterExpression2, maps:new()),
+    FilterFun6 =
+        fun(AttrMap) ->
+            leveled_filter:apply_filter(FilterExpressionParsed2, AttrMap)
+        end,
+    {ok, EvalExpression2} =
+        leveled_eval:generate_eval_expression(
+            "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))",
+            maps:new()
+        ),
+    EvalFun2 =
+        fun(Term, Key) ->
+            leveled_eval:apply_eval(EvalExpression2, Term, Key, maps:new())
+        end,
+    QueryRE2_10 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false, {eval, EvalFun2, FilterFun6}}
+        },
+    {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10),
+    BornMid70sRE2_10 = RunnerRE2_10(),
+
+    SW11 = os:timestamp(),
+
+    true = length(BornMid70s0) > 0,
+
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sPCRE1),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_2),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_8),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_9),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_10),
+
+    maybe_log_toscreen(
+        "~nFilter outside took ~w ms~n",
+        [timer:now_diff(SW1, SW0) div 1000]),
+    
maybe_log_toscreen( + "~nPCRE Capture filter inside took ~w ms~n", + [timer:now_diff(SW2, SW1) div 1000]), + maybe_log_toscreen( + "~nRE2 Capture filter inside took ~w ms~n", + [timer:now_diff(SW3, SW2) div 1000]), + maybe_log_toscreen( + "~nRE2 Capture filter outside took ~w ms~n", + [timer:now_diff(SW4, SW3) div 1000]), + maybe_log_toscreen( + "~nRE2 double-capture filter outside took ~w ms~n", + [timer:now_diff(SW5, SW4) div 1000]), + maybe_log_toscreen( + "~nRE2 single-capture filter with parsed filter expression took ~w ms~n", + [timer:now_diff(SW9, SW8) div 1000]), + maybe_log_toscreen( + "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", + [timer:now_diff(SW10, SW9) div 1000]), + maybe_log_toscreen( + "~nEval processed index with parsed filter expression took ~w ms~n", + [timer:now_diff(SW11, SW10) div 1000]), + + + QueryRE2_3_WrongCapture = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {<<"gns">>, {eval, EvalFunRE2, FilterFun6}} + }, + {async, RunnerRE2_3_WC} = + leveled_bookie:book_returnfolder(Book1, QueryRE2_3_WrongCapture), + true = [] == RunnerRE2_3_WC(), + + ok = leveled_bookie:book_close(Book1), + + testutil:reset_filestructure(). + +maybe_log_toscreen(Log, Subs) -> + io:format( + % user, + Log, + Subs + ). + + count_termsonindex(Bucket, IdxField, Book, QType) -> lists:foldl( fun(X, Acc) -> @@ -865,39 +1152,30 @@ count_termsonindex(Bucket, IdxField, Book, QType) -> multibucket_fold(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), + ObjectGen = <<"V1">>, IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(13000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type1">>, <<"Bucket1">>}), + B1 = {<<"Type1">>, <<"Bucket1">>}, + B2 = <<"Bucket2">>, + B3 = <<"Bucket3">>, + B4 = {<<"Type2">>, <<"Bucket4">>}, + ObjL1 = + testutil:generate_objects( + 13000, uuid, [], ObjectGen, IndexGen, B1), testutil:riakload(Bookie1, ObjL1), - ObjL2 = testutil:generate_objects(17000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket2">>), + ObjL2 = + testutil:generate_objects( + 17000, uuid, [], ObjectGen, IndexGen, B2), testutil:riakload(Bookie1, ObjL2), - ObjL3 = testutil:generate_objects(7000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket3">>), + ObjL3 = + testutil:generate_objects( + 7000, uuid, [], ObjectGen, IndexGen, B3), testutil:riakload(Bookie1, ObjL3), - ObjL4 = testutil:generate_objects(23000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type2">>, <<"Bucket4">>}), + ObjL4 = + testutil:generate_objects( + 23000, uuid, [], ObjectGen, IndexGen, B4), testutil:riakload(Bookie1, ObjL4), FF = fun(B, K, _PO, Acc) -> @@ -964,44 +1242,37 @@ rotating_objects(_Config) -> foldobjects_bybucket_range(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), ObjectGen = testutil:get_compressiblevalue_andinteger(), IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(1300, - {fixed_binary, 1}, - [], - ObjectGen, - IndexGen, - <<"Bucket1">>), + ObjL1 = + testutil:generate_objects( + 1300, {fixed_binary, 1}, [], ObjectGen, IndexGen, 
<<"Bucket1">>), testutil:riakload(Bookie1, ObjL1), - FoldKeysFun = fun(_B, K,_V, Acc) -> - [ K |Acc] - end, + FoldKeysFun = + fun(_B, K,_V, Acc) -> [ K |Acc] end, StartKey = testutil:fixed_bin_key(123), EndKey = testutil:fixed_bin_key(779), - {async, Folder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - {StartKey, EndKey}, {FoldKeysFun, []}, - true - ), + {async, Folder} = + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + <<"Bucket1">>, + {StartKey, EndKey}, {FoldKeysFun, []}, + true + ), ResLen = length(Folder()), io:format("Length of Result of folder ~w~n", [ResLen]), true = 657 == ResLen, - {async, AllFolder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - all, - {FoldKeysFun, []}, - true - ), + {async, AllFolder} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, <<"Bucket1">>, all, {FoldKeysFun, []}, true), AllResLen = length(AllFolder()), io:format("Length of Result of all keys folder ~w~n", [AllResLen]), diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl index 93f78f10..e4c54117 100644 --- a/test/end_to_end/perf_SUITE.erl +++ b/test/end_to_end/perf_SUITE.erl @@ -5,11 +5,18 @@ -export([ riak_ctperf/1, riak_fullperf/1, riak_profileperf/1, riak_miniperf/1 ]). +-export([random_people_index/0]). -define(PEOPLE_INDEX, <<"people_bin">>). -define(MINI_QUERY_DIVISOR, 8). -define(RGEX_QUERY_DIVISOR, 32). +-ifdef(test_filter_expression). + -define(TEST_FE, true). +-else. + -define(TEST_FE, false). +-endif. + -ifndef(performance). -define(performance, riak_ctperf). -endif. @@ -563,8 +570,56 @@ random_queries(Bookie, Bucket, IDs, IdxCnt, MaxRange, IndexesReturned) -> ), TC div 1000. - random_people_queries(Bookie, Bucket, IndexesReturned) -> + random_people_queries(?TEST_FE, Bookie, Bucket, IndexesReturned). 
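+
+%% When compiled with the test_filter_expression define (?TEST_FE =:= true)
+%% the people queries use the eval/filter expression DSL in place of a
+%% compiled regex. A minimal sketch of the evaluation step used in the
+%% clause below, assuming the "|"-delimited people index term format used
+%% in these tests (the sample term and key are hypothetical):
+%%
+%%   {ok, Eval} =
+%%       leveled_eval:generate_eval_expression(
+%%           "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))",
+%%           maps:new()),
+%%   AttrMap =
+%%       leveled_eval:apply_eval(
+%%           Eval, <<"Smith|19750605||#Willow|#LS1">>, <<"K0001">>,
+%%           maps:new()),
+%%   %% AttrMap should now map <<"dob">> to <<"19750605">> and so on for
+%%   %% each delimited field, ready for leveled_filter:apply_filter/2.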
+
+random_people_queries(true, Bookie, Bucket, IndexesReturned) ->
+    FilterExpression =
+        "($dob BETWEEN \"19700101\" AND \"19791231\") "
+        "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))",
+    {ok, ParsedFilter} =
+        leveled_filter:generate_filter_expression(
+            FilterExpression, maps:new()),
+    FilterFun =
+        fun(AttrMap) -> leveled_filter:apply_filter(ParsedFilter, AttrMap) end,
+    EvalExpression = "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))",
+    {ok, ParsedEval} =
+        leveled_eval:generate_eval_expression(EvalExpression, maps:new()),
+    EvalFun =
+        fun(Term, Key) ->
+            leveled_eval:apply_eval(ParsedEval, Term, Key, maps:new())
+        end,
+
+    QueryFun =
+        fun() ->
+            Surname = get_random_surname(),
+            Range =
+                {?PEOPLE_INDEX,
+                    Surname,
+                    <<Surname/binary, 126:8/integer>>
+                },
+            FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end,
+            {async, R} =
+                leveled_bookie:book_indexfold(
+                    Bookie,
+                    {Bucket, <<>>},
+                    {FoldKeysFun, 0},
+                    Range,
+                    {true, {eval, EvalFun, FilterFun}}),
+            R()
+        end,
+
+    {TC, {QC, EF}} =
+        timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end),
+    ct:log(
+        ?INFO,
+        "Fetch of ~w index entries by regex in ~w queries took ~w ms"
+        " with filter_expression=~w",
+        [EF, QC, TC div 1000, true]
+    ),
+    TC div 1000;
+random_people_queries(false, Bookie, Bucket, IndexesReturned) ->
     SeventiesWillowRegex =
         "[^\\|]*\\|197[0-9]{5}\\|[^\\|]*\\|"
         "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*",
@@ -577,8 +632,7 @@ random_people_queries(Bookie, Bucket, IndexesReturned) ->
                 Surname,
                 <<Surname/binary, 126:8/integer>>
             },
-            {ok, TermRegex} =
-                leveled_util:regex_compile(SeventiesWillowRegex),
+            {ok, TermRegex} = leveled_util:regex_compile(SeventiesWillowRegex),
             FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end,
             {async, R} =
                 leveled_bookie:book_indexfold(
@@ -594,8 +648,9 @@ random_people_queries(Bookie, Bucket, IndexesReturned) ->
         timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end),
     ct:log(
         ?INFO,
-        "Fetch of ~w index entries by regex in ~w queries took ~w ms",
-        [EF, QC, TC div 1000]
+        "Fetch of ~w index entries by regex in ~w queries took ~w ms"
+        " with filter_expression=~w",
+        [EF, QC, TC div 1000, false]
     ),
     TC div 1000.
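+
+%% For reference, a minimal sketch of the filter DSL round-trip on its own,
+%% using the same generate/apply calls as above (the attribute map here is
+%% hypothetical, standing in for the output of an eval):
+%%
+%%   {ok, Filter} =
+%%       leveled_filter:generate_filter_expression(
+%%           "($dob BETWEEN \"19700101\" AND \"19791231\") "
+%%           "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))",
+%%           maps:new()),
+%%   true =
+%%       leveled_filter:apply_filter(
+%%           Filter,
+%%           #{<<"dob">> => <<"19750605">>,
+%%               <<"gns">> => <<"#Willow">>, <<"pcs">> => <<"#LS1">>}).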