From 287e8be2a4d0ede8b4d54297e4052ec47ff477be Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 00:41:32 +0100 Subject: [PATCH 1/8] Initial support for capture in regex Allow for a capturing regular expression to be passed with a filter function that will filter based on the captured output. Also allows a specific captured term to be returned as the returned term (rather than the whole index term) --- src/leveled_bookie.erl | 2 +- src/leveled_codec.erl | 45 ++++++- src/leveled_runner.erl | 13 +- src/leveled_util.erl | 34 +++-- test/end_to_end/iterator_SUITE.erl | 206 ++++++++++++++++++++++++++++- test/end_to_end/perf_SUITE.erl | 1 + 6 files changed, 275 insertions(+), 26 deletions(-) diff --git a/src/leveled_bookie.erl b/src/leveled_bookie.erl index 34a2435f..ade345ca 100644 --- a/src/leveled_bookie.erl +++ b/src/leveled_bookie.erl @@ -696,7 +696,7 @@ book_returnfolder(Pid, RunnerType) -> IndexVal::term(), Start::IndexVal, End::IndexVal, - ReturnTerms::boolean(), + ReturnTerms::boolean()|binary(), TermRegex :: leveled_codec:regular_expression(). book_indexfold(Pid, Constraint, FoldAccT, Range, TermHandling) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index fd1ef627..1f655fde 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -112,10 +112,13 @@ {index_specs(), infinity|integer()}. % {KeyChanges, TTL} -type maybe_lookup() :: lookup|no_lookup. +-type actual_regex() :: + reference()|{re_pattern, term(), term(), term(), term()}. +-type capture_filter_fun() :: fun((list({binary(), binary()})) -> boolean()). +-type capture_reference() :: + {capture, actual_regex(), list(binary()), capture_filter_fun()}. -type regular_expression() :: - {re_pattern, term(), term(), term(), term()}|undefined. - % first element must be re_pattern, but tuple may change legnth with - % versions + actual_regex()|undefined|capture_reference(). -type value_fetcher() :: {fun((pid(), leveled_codec:journal_key()) -> any()), @@ -155,6 +158,7 @@ last_moddate/0, lastmod_range/0, regular_expression/0, + actual_regex/0, value_fetcher/0, proxy_object/0, slimmed_key/0 @@ -292,8 +296,9 @@ maybe_accumulate( maybe_accumulate(T, Acc, Count, Filter, AccFun). -spec accumulate_index( - {boolean(), undefined|leveled_runner:mp()}, leveled_runner:acc_fun()) - -> any(). + {boolean()|binary(), + regular_expression()}, + leveled_runner:acc_fun()) -> any(). 
accumulate_index({false, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, _IndexInfo, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, ObjKey, Acc) @@ -302,9 +307,36 @@ accumulate_index({true, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) end; +accumulate_index( + {AddTerm, {capture, TermRegex, CptKeys, FilterFun}}, FoldKeysFun) -> + ExpectedKeys = length(CptKeys), + Opts = [{capture, all_but_first, binary}], + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + case leveled_util:regex_run(IdxValue, TermRegex, Opts) of + {match, CptTerms} when length(CptTerms) == ExpectedKeys -> + CptPairs = lists:zip(CptKeys, CptTerms), + case FilterFun(CptPairs) of + true -> + case AddTerm of + true -> + FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(Bucket, ObjKey, Acc); + CptKey when is_binary(CptKey) -> + {CptKey, CptValue} = + lists:keyfind(CptKey, 1, CptPairs), + FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) + end; + false -> + Acc + end; + _ -> + Acc + end + end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - case leveled_util:regex_run(IdxValue, TermRegex) of + case leveled_util:regex_run(IdxValue, TermRegex, []) of nomatch -> Acc; _ -> @@ -317,6 +349,7 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end end. + -spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc %% When comparing two keys in the ledger need to find if one key comes before diff --git a/src/leveled_runner.erl b/src/leveled_runner.erl index 4b221b09..15704b20 100644 --- a/src/leveled_runner.erl +++ b/src/leveled_runner.erl @@ -130,12 +130,11 @@ bucket_list(SnapFun, Tag, FoldBucketsFun, InitAcc, MaxBuckets) -> end, {async, Runner}. --spec index_query(snap_fun(), - {leveled_codec:ledger_key(), - leveled_codec:ledger_key(), - {boolean(), undefined|mp()|iodata()}}, - {fold_keys_fun(), foldacc()}) - -> {async, runner_fun()}. +-spec index_query( + snap_fun(), + {leveled_codec:ledger_key(), leveled_codec:ledger_key(), + {boolean()|binary(), leveled_codec:regular_expression()}}, + {fold_keys_fun(), foldacc()}) -> {async, runner_fun()}. %% @doc %% Secondary index query %% This has the special capability that it will expect a message to be thrown @@ -681,7 +680,7 @@ accumulate_keys(FoldKeysFun, undefined) -> accumulate_keys(FoldKeysFun, TermRegex) -> fun(Key, _Value, Acc) -> {B, K} = leveled_codec:from_ledgerkey(Key), - case leveled_util:regex_run(K, TermRegex) of + case leveled_util:regex_run(K, TermRegex, []) of nomatch -> Acc; _ -> diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 0817b32b..9f82fdac 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -11,7 +11,7 @@ magic_hash/1, t2b/1, safe_rename/4, - regex_run/2, + regex_run/3, regex_compile/1 ]). @@ -42,15 +42,31 @@ integer_time(TS) -> DT = calendar:now_to_universal_time(TS), calendar:datetime_to_gregorian_seconds(DT). + +-type match_option() :: + 'caseless' | + {'offset', non_neg_integer()} | + {'capture', value_spec()} | + {'capture', value_spec(), value_spec_type()}. +-type value_spec() :: + 'all' | 'all_but_first' | 'first' | 'none' | [value_id()]. +-type value_spec_type() :: 'binary'. +-type value_id() :: binary(). +-type match_index() :: {non_neg_integer(), non_neg_integer()}. + -spec regex_run( - iodata(), any()) -> - match | nomatch | {match, list()} | {error, atom()}. 
-regex_run(Subject, CompiledRE2) when is_reference(CompiledRE2) ->
-    re2:run(Subject, CompiledRE2);
-regex_run(Subject, CompiledPCRE) ->
-    re:run(Subject, CompiledPCRE).
-
--spec regex_compile(iodata()) -> reference().
+        iodata(), leveled_codec:actual_regex(), list(match_option())) ->
+            match |
+            nomatch |
+            {match, list(match_index())} |
+            {match, list(binary())} |
+            {error, atom()}.
+regex_run(Subject, CompiledRE2, Opts) when is_reference(CompiledRE2) ->
+    re2:run(Subject, CompiledRE2, Opts);
+regex_run(Subject, CompiledPCRE, Opts) ->
+    re:run(Subject, CompiledPCRE, Opts).
+
+-spec regex_compile(iodata()) -> {ok, reference()}.
 regex_compile(PlainRegex) ->
     re2:compile(PlainRegex).

diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl
index a72f36e9..899601ca 100644
--- a/test/end_to_end/iterator_SUITE.erl
+++ b/test/end_to_end/iterator_SUITE.erl
@@ -13,7 +13,9 @@
         query_count/1,
         multibucket_fold/1,
         foldobjects_bybucket_range/1,
-        rotating_objects/1]).
+        rotating_objects/1,
+        capture_and_filter_terms/1
+    ]).

 all() -> [
             expiring_indexes,
             breaking_folds,
             single_object_with2i,
             small_load_with2i,
             query_count,
             multibucket_fold,
             rotating_objects,
-            foldobjects_bybucket_range
+            foldobjects_bybucket_range,
+            capture_and_filter_terms
             ].

@@ -677,7 +680,7 @@ query_count(_Config) ->
     Mia2000Count2 =
         lists:foldl(
             fun({Term, _Key}, Acc) ->
-                case leveled_util:regex_run(Term, RegMia) of
+                case leveled_util:regex_run(Term, RegMia, []) of
                     nomatch ->
                         Acc;
                     _ ->
@@ -840,6 +843,203 @@ query_count(_Config) ->
     testutil:reset_filestructure().

+capture_and_filter_terms(_Config) ->
+    RootPath = testutil:reset_filestructure(),
+    Bucket = {<<"Type1">>, <<"Bucket1">>},
+    IdxName = <<"people_bin">>,
+    {ok, Book1} =
+        leveled_bookie:book_start(
+            RootPath, 2000, 50000000, testutil:sync_strategy()),
+    ObjectGen = testutil:get_compressiblevalue_andinteger(),
+    IndexGen =
+        fun() ->
+            [{add, IdxName, list_to_binary(perf_SUITE:random_people_index())}]
+        end,
+    ObjL1 =
+        testutil:generate_objects(
+            80000, uuid, [], ObjectGen, IndexGen, Bucket),
+    testutil:riakload(Book1, ObjL1),
+
+    StartDoB = <<"19740301">>,
+    EndDoB = <<"19761031">>,
+
+    WillowLeedsFinder =
+        "[^\\|]*\\|[0-9]{8}\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+
+    SW0 = os:timestamp(),
+    {ok, WillowLeedsPCRE} = re:compile(WillowLeedsFinder),
+    QueryPCRE0 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {true, WillowLeedsPCRE}},
+    {async, Runner0} = leveled_bookie:book_returnfolder(Book1, QueryPCRE0),
+    Results0 = Runner0(),
+    BornMid70s0 =
+        lists:filtermap(
+            fun({IdxValue, Key}) ->
+                DoB =
+                    list_to_binary(
+                        lists:nth(
+                            2,
+                            string:tokens(binary_to_list(IdxValue), "|")
+                        )),
+                case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
+                    true ->
+                        {true, Key};
+                    false ->
+                        false
+                end
+            end,
+            Results0
+        ),
+
+    SW1 = os:timestamp(),
+
+    WillowLeedsExtractor =
+        "[^\\|]*\\|(?P<dob>[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+    FilterFun1 =
+        fun([{<<"dob">>, DoB}]) ->
+            (DoB >= StartDoB) andalso (DoB =< EndDoB)
+        end,
+    {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
+    QueryPCRE1 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
+    BornMid70sPCRE1 = RunnerPCRE1(),
+
+    SW2 = os:timestamp(),
+
+    {ok, WillowLeedsExtractorRE2} =
+        re2:compile(WillowLeedsExtractor),
+    QueryRE2_2 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
+    BornMid70sRE2_2 = RunnerRE2_2(),
+
+    SW3 = os:timestamp(),
+
+    QueryRE2_3 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {<<"dob">>,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], fun(_) -> true end}}
+        },
+    {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
+    Results3 = RunnerRE2_3(),
+    BornMid70sRE2_3 =
+        lists:filtermap(
+            fun({DoB, Key}) ->
+                case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
+                    true ->
+                        {true, Key};
+                    false ->
+                        false
+                end
+            end,
+            Results3
+        ),
+
+    SW4 = os:timestamp(),
+
+    WillowLeedsDoubleExtractor =
+        "[^\\|]*\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
+        "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*",
+
+    FilterFun2 =
+        fun(Captures) ->
+            {<<"dob">>, DoB} = hd(Captures),
+            (DoB >= StartDoB) andalso (DoB =< EndDoB)
+        end,
+    QueryRE2_4 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
+                    [<<"dob">>, <<"dod">>],
+                    FilterFun2}}
+        },
+    {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
+    BornMid70sRE2_4 = RunnerRE2_4(),
+
+    SW5 = os:timestamp(),
+
+    QueryRE2_5 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {true,
+                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+        },
+    {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
+    BornMid70sRE2_5 =
+        lists:filtermap(
+            fun({T, K}) ->
+                {match, _} =
+                    leveled_util:regex_run(T, WillowLeedsExtractorRE2, []),
+                {true, K}
+            end,
+            RunnerRE2_5()),
+
+    true = length(BornMid70s0) > 0,
+    true = length(BornMid70sPCRE1) > 0,
+    true = length(BornMid70sRE2_2) > 0,
+    true = length(BornMid70sRE2_3) > 0,
+    true = length(BornMid70sRE2_4) > 0,
+    true = length(BornMid70sRE2_5) > 0,
+
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sPCRE1),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_2),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5),
+
+    io:format(
+        % user,
+        "~nFilter outside took ~w ms~n",
+        [timer:now_diff(SW1, SW0) div 1000]),
+    io:format(
+        % user,
+        "~nPCRE Capture filter inside took ~w ms~n",
+        [timer:now_diff(SW2, SW1) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 Capture filter inside took ~w ms~n",
+        [timer:now_diff(SW3, SW2) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 Capture filter outside took ~w ms~n",
+        [timer:now_diff(SW4, SW3) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 double-capture filter outside took ~w ms~n",
+        [timer:now_diff(SW5, SW4) div 1000]),
+
+    ok = leveled_bookie:book_close(Book1),
+
+    testutil:reset_filestructure().
+
+
 count_termsonindex(Bucket, IdxField, Book, QType) ->
     lists:foldl(
         fun(X, Acc) ->

diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl
index 93f78f10..27549f54 100644
--- a/test/end_to_end/perf_SUITE.erl
+++ b/test/end_to_end/perf_SUITE.erl
@@ -5,6 +5,7 @@
 -export([
     riak_ctperf/1, riak_fullperf/1, riak_profileperf/1, riak_miniperf/1
 ]).
+-export([random_people_index/0]).
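
In summary, the capture query introduced by this first patch takes the following shape — a minimal sketch, assuming a started bookie `Bookie` and a bucket `Bucket` (illustrative names), with index terms in the pipe-delimited format used by the test above. At this point in the series the filter fun receives the captured terms as a list of {Key, Value} binary pairs, per the capture_filter_fun() type:

    %% Illustrative only: filter on the captured date-of-birth,
    %% returning only the matching object keys
    {ok, DoBExtractor} =
        leveled_util:regex_compile("[^\\|]*\\|(?P<dob>[0-9]{8})\\|.*"),
    DoBFilter =
        fun([{<<"dob">>, DoB}]) ->
            DoB >= <<"19740301">> andalso DoB =< <<"19761031">>
        end,
    Query =
        {index_query,
            {Bucket, null},
            {fun testutil:foldkeysfun/3, []},
            {<<"people_bin">>, <<"M">>, <<"Z">>},
            {false, {capture, DoBExtractor, [<<"dob">>], DoBFilter}}},
    {async, Runner} = leveled_bookie:book_returnfolder(Bookie, Query),
    MatchingKeys = Runner().

Per the accumulate_index/2 clauses above, passing `true` as the first element of the term-handling tuple returns {IndexTerm, ObjKey} pairs, while passing a capture key such as <<"dob">> returns {CapturedValue, ObjKey} pairs instead.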
-define(PEOPLE_INDEX, <<"people_bin">>). -define(MINI_QUERY_DIVISOR, 8). From f54d75c72c51ac2f0aba1ca063d176c22f1f2f24 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 13:37:19 +0100 Subject: [PATCH 2/8] Create leveled_filter.erl First draft of allowing an externally provided comparison expression to be validated, parsed and evaluated --- src/leveled_filter.erl | 174 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 src/leveled_filter.erl diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl new file mode 100644 index 00000000..3d9d04b2 --- /dev/null +++ b/src/leveled_filter.erl @@ -0,0 +1,174 @@ +%% -------- Filter Functions --------- +%% +%% Support for different filter expressions within leveled +%% + +-module(leveled_filter). + +-export([validate_comparison_expression/2]). + +%%%============================================================================ +%%% External API +%%%============================================================================ + +%% @doc validate a comaprison expression +%% A comparison expression allows for string comparisons to be made using +%% >, <, >=, =< with andalso, or as well as () to group different expressions +%% The output is a function which will be passed a Map where the Map may be the +%% result of reading some projected attributes on an index string. +%% To compare a string with a key in the map, the name of the key must begin +%% with a "$". +%% In creating the function, the potential keys must be known and passed as a +%% string in the list (without the leading $). +-spec validate_comparison_expression( + string(), list(string())) -> + fun((map()) -> {value, boolean(), map()}) | + {error, string(), erl_anno:location()}. +validate_comparison_expression( + Expression, AvailableArgs) when is_list(Expression) -> + case erl_scan:string(Expression) of + {ok, Tokens, _EndLoc} -> + case vert_compexpr_tokens(Tokens, AvailableArgs, []) of + {ok, UpdTokens} -> + case erl_parse:parse_exprs(UpdTokens) of + {ok, [Form]} -> + io:format("Form ~p~n", [Form]), + fun(LookupMap) -> + erl_eval:expr(Form, LookupMap) + end; + {error, ErrorInfo} -> + {error, ErrorInfo, undefined} + end; + {error, Error, ErrorLocation} -> + {error, lists:flatten(Error), ErrorLocation} + end; + {error, Error, ErrorLocation} -> + {error, Error, ErrorLocation} + end; +validate_comparison_expression(_Expression, _AvailableArgs) -> + {error, "Expression not a valid string", undefined}. 
+ +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +vert_compexpr_tokens([], _Args, ParsedTokens) -> + {ok, lists:reverse(ParsedTokens)}; +vert_compexpr_tokens([{dot, LN}], Args, ParsedTokens) -> + vert_compexpr_tokens([], Args, [{dot, LN}|ParsedTokens]); +vert_compexpr_tokens(L, _Args, _ParsedTokens) when length(L) == 1 -> + {error, "Query does not end with `.`", undefined}; +vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> + case hd(String) of + 36 -> % 36 == $ + Arg = tl(String), + case lists:member(Arg, Args) of + true -> + VarToken = {var, LN, list_to_atom(Arg)}, + vert_compexpr_tokens(Rest, Args, [VarToken|ParsedTokens]); + false -> + {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} + end; + _ -> % Not a key, treat as a string + vert_compexpr_tokens( + Rest, Args, [{string, LN, String}|ParsedTokens]) + end; +vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); +vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); +vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); +vert_compexpr_tokens([{'or', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'or', LN}|ParsedTokens]); +vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); +vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'<', LN}|ParsedTokens]); +vert_compexpr_tokens([{'>=', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'>=', LN}|ParsedTokens]); +vert_compexpr_tokens([{'=<', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'=<', LN}|ParsedTokens]); +vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> + {error, io_lib:format("Invalid token ~w", [InvalidToken]), undefined}. + + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + +good_querystring_test() -> + QueryString = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", + AvailableArgs = ["dob", "dod"], + F = validate_comparison_expression(QueryString, AvailableArgs), + M1 = maps:from_list([{dob, "19750202"}, {dod, "20221216"}]), + M2 = maps:from_list([{dob, "19750202"}, {dod, "20191216"}]), + M3 = maps:from_list([{dob, "19790202"}, {dod, "20221216"}]), + M4 = maps:from_list([{dob, "19790202"}, {dod, "20191216"}]), + M5 = maps:from_list([{dob, "19790202"}, {dod, "20241216"}]), + ?assertMatch(true, element(2, F(M1))), + ?assertMatch(true, element(2, F(M2))), + ?assertMatch(true, element(2, F(M3))), + ?assertMatch(false, element(2, F(M4))), + ?assertMatch(false, element(2, F(M5))). 
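
The `$` convention is implemented purely as a token rewrite ahead of parsing; for example, scanning a fragment of the expression used in the test above yields string tokens (a sketch, with locations as returned by erl_scan):

    1> erl_scan:string("\"$dob\" >= \"19740301\".").
    {ok,[{string,1,"$dob"},{'>=',1},{string,1,"19740301"},{dot,1}],1}

vert_compexpr_tokens/3 then swaps {string,1,"$dob"} for {var,1,dob}, provided "dob" appears in the available-args list, leaving a form that erl_parse and erl_eval can treat as an ordinary variable comparison.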
+ +bad_querystring_test() -> + QueryString = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString1 = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\")", + BadString2 = + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\")).", + BadString3 = + "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString4 = "os:cmd(foo)", + BadString5 = + [42,63,52,10,240,159,140,190] ++ + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" + " or (\"$dod\" > \"20200101\").", + BadString6 = [(16#110000 + 1)|QueryString], + BadString7 = <<42:32/integer>>, + ?assertMatch( + {error, "Unavailable Arg dod", 1}, + validate_comparison_expression(QueryString, ["dob", "pv"]) + ), + ?assertMatch( + {error, "Query does not end with `.`", undefined}, + validate_comparison_expression(BadString1, ["dob", "dod"]) + ), + ?assertMatch( + {error, {1 ,erl_parse, ["syntax error before: ","')'"]}, undefined}, + validate_comparison_expression(BadString2, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {'and',1}", undefined}, + validate_comparison_expression(BadString3, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {atom,1,os}", undefined}, + validate_comparison_expression(BadString4, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Invalid token {'*',1}", undefined}, + validate_comparison_expression(BadString5, ["dob", "dod"]) + ), + ?assertMatch( + {error, {1, erl_scan, {illegal,character}}, 1}, + validate_comparison_expression(BadString6, ["dob", "dod"]) + ), + ?assertMatch( + {error, "Expression not a valid string", undefined}, + validate_comparison_expression(BadString7, ["dob", "dod"]) + ). + +-endif. From 4a4a7ca9542a2717105d720ff9c6d13031b782fa Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 24 Apr 2024 17:44:45 +0100 Subject: [PATCH 3/8] Make binary comparisons in leveled_filter Required as re2 can only output binary not string in captures. --- src/leveled_codec.erl | 13 ++- src/leveled_filter.erl | 51 ++++++------ test/end_to_end/iterator_SUITE.erl | 123 +++++++++++++++++++++++------ 3 files changed, 132 insertions(+), 55 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 1f655fde..7426fb23 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -114,9 +114,9 @@ lookup|no_lookup. -type actual_regex() :: reference()|{re_pattern, term(), term(), term(), term()}. --type capture_filter_fun() :: fun((list({binary(), binary()})) -> boolean()). +-type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). -type capture_reference() :: - {capture, actual_regex(), list(binary()), capture_filter_fun()}. + {capture, actual_regex(), list(atom()), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). 
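
With the revised capture_filter_fun() type, a filter fun now matches on a map of atom-keyed captures rather than the ordered list of pairs used in the first patch; a sketch of the two shapes (capture values are binaries, as regex_run/3 is called with the binary capture option):

    %% Patch 1 shape: ordered list of {Key, Value} pairs
    FilterFunV1 = fun([{<<"dob">>, DoB}]) -> DoB >= <<"19740301">> end,
    %% Shape after this patch: map with atom keys
    FilterFunV2 = fun(#{dob := DoB}) -> DoB >= <<"19740301">> end,
    true = FilterFunV1([{<<"dob">>, <<"19750601">>}]),
    true = FilterFunV2(#{dob => <<"19750601">>}).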
@@ -314,17 +314,16 @@ accumulate_index( fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, Opts) of {match, CptTerms} when length(CptTerms) == ExpectedKeys -> - CptPairs = lists:zip(CptKeys, CptTerms), - case FilterFun(CptPairs) of + CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + case FilterFun(CptMap) of true -> case AddTerm of true -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); false -> FoldKeysFun(Bucket, ObjKey, Acc); - CptKey when is_binary(CptKey) -> - {CptKey, CptValue} = - lists:keyfind(CptKey, 1, CptPairs), + CptKey when is_atom(CptKey) -> + CptValue = maps:get(CptKey, CptMap), FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) end; false -> diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 3d9d04b2..a273b128 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -11,9 +11,10 @@ %%% External API %%%============================================================================ -%% @doc validate a comaprison expression +%% @doc validate a comparison expression %% A comparison expression allows for string comparisons to be made using -%% >, <, >=, =< with andalso, or as well as () to group different expressions +%% >, <, >=, =< with andalso, orelse as well as () to group different +%% expressions. %% The output is a function which will be passed a Map where the Map may be the %% result of reading some projected attributes on an index string. %% To compare a string with a key in the map, the name of the key must begin @@ -32,9 +33,8 @@ validate_comparison_expression( {ok, UpdTokens} -> case erl_parse:parse_exprs(UpdTokens) of {ok, [Form]} -> - io:format("Form ~p~n", [Form]), fun(LookupMap) -> - erl_eval:expr(Form, LookupMap) + element(2, erl_eval:expr(Form, LookupMap)) end; {error, ErrorInfo} -> {error, ErrorInfo, undefined} @@ -69,9 +69,10 @@ vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> false -> {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} end; - _ -> % Not a key, treat as a string + _ -> % Not a key, treat as a binary + BinString = [{'>>', LN}, {string, LN, String}, {'<<', LN}], vert_compexpr_tokens( - Rest, Args, [{string, LN, String}|ParsedTokens]) + Rest, Args, BinString ++ ParsedTokens) end; vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); @@ -79,8 +80,8 @@ vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); -vert_compexpr_tokens([{'or', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'or', LN}|ParsedTokens]); +vert_compexpr_tokens([{'orelse', LN}|Rest], Args, ParsedTokens) -> + vert_compexpr_tokens(Rest, Args, [{'orelse', LN}|ParsedTokens]); vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> @@ -104,38 +105,38 @@ vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> good_querystring_test() -> QueryString = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", + " orelse (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", AvailableArgs = ["dob", "dod"], F = validate_comparison_expression(QueryString, AvailableArgs), - M1 = 
maps:from_list([{dob, "19750202"}, {dod, "20221216"}]), - M2 = maps:from_list([{dob, "19750202"}, {dod, "20191216"}]), - M3 = maps:from_list([{dob, "19790202"}, {dod, "20221216"}]), - M4 = maps:from_list([{dob, "19790202"}, {dod, "20191216"}]), - M5 = maps:from_list([{dob, "19790202"}, {dod, "20241216"}]), - ?assertMatch(true, element(2, F(M1))), - ?assertMatch(true, element(2, F(M2))), - ?assertMatch(true, element(2, F(M3))), - ?assertMatch(false, element(2, F(M4))), - ?assertMatch(false, element(2, F(M5))). + M1 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20221216">>}]), + M2 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20191216">>}]), + M3 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20221216">>}]), + M4 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20191216">>}]), + M5 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20241216">>}]), + ?assertMatch(true, F(M1)), + ?assertMatch(true, F(M2)), + ?assertMatch(true, F(M3)), + ?assertMatch(false, F(M4)), + ?assertMatch(false, F(M5)). bad_querystring_test() -> QueryString = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + " orelse (\"$dod\" > \"20200101\").", BadString1 = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\")", + " orelse (\"$dod\" > \"20200101\")", BadString2 = "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\")).", + " orelse (\"$dod\" > \"20200101\")).", BadString3 = "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + " orelse (\"$dod\" > \"20200101\").", BadString4 = "os:cmd(foo)", BadString5 = [42,63,52,10,240,159,140,190] ++ - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " or (\"$dod\" > \"20200101\").", + "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< <<\"19761030\")" + " orelse (\"$dod\" > \"20200101\").", BadString6 = [(16#110000 + 1)|QueryString], BadString7 = <<42:32/integer>>, ?assertMatch( diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 899601ca..0cf0ebb5 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -1,6 +1,5 @@ -module(iterator_SUITE). --include_lib("common_test/include/ct.hrl"). -include("include/leveled.hrl"). -define(KEY_ONLY, {false, undefined}). @@ -18,14 +17,14 @@ ]). all() -> [ - expiring_indexes, - breaking_folds, - single_object_with2i, - small_load_with2i, - query_count, - multibucket_fold, - rotating_objects, - foldobjects_bybucket_range, + % expiring_indexes, + % breaking_folds, + % single_object_with2i, + % small_load_with2i, + % query_count, + % multibucket_fold, + % rotating_objects, + % foldobjects_bybucket_range, capture_and_filter_terms ]. 
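
One subtlety in the tokeniser change above: ParsedTokens is accumulated in reverse, so prepending [{'>>', LN}, {string, LN, String}, {'<<', LN}] produces the tokens of a binary literal in the correct order once the list is finally reversed. A literal such as "19740301" therefore emerges as the token sequence

    {'<<', LN}, {string, LN, "19740301"}, {'>>', LN}

and the expression is evaluated as `Dob >= <<"19740301">>`, comparing like-for-like with the binary capture values that re2 produces — the motivation given in this patch's commit message.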
@@ -885,7 +884,8 @@ capture_and_filter_terms(_Config) ->
                         lists:nth(
                             2,
                             string:tokens(binary_to_list(IdxValue), "|")
-                        )),
+                        )
+                    ),
                 case (DoB >= StartDoB) andalso (DoB =< EndDoB) of
                     true ->
                         {true, Key};
@@ -902,7 +902,8 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*\\|(?P<dob>[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
         "[^\\|]*#LS[^\\|]*",
     FilterFun1 =
-        fun([{<<"dob">>, DoB}]) ->
+        fun(Captures) ->
+            DoB = maps:get(dob, Captures),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
@@ -912,7 +913,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {false,
-                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorPCRE, [dob], FilterFun1}}
         },
     {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
     BornMid70sPCRE1 = RunnerPCRE1(),
@@ -926,7 +927,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {false,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}}
         },
     {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
     BornMid70sRE2_2 = RunnerRE2_2(),
@@ -938,8 +939,8 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"dob">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], fun(_) -> true end}}
+            {dob,
+                {capture, WillowLeedsExtractorRE2, [dob], fun(_) -> true end}}
         },
     {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
     Results3 = RunnerRE2_3(),
@@ -964,7 +965,7 @@ capture_and_filter_terms(_Config) ->
     FilterFun2 =
         fun(Captures) ->
-            {<<"dob">>, DoB} = hd(Captures),
+            DoB = maps:get(dob, Captures),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     QueryRE2_4 =
@@ -975,7 +976,7 @@ capture_and_filter_terms(_Config) ->
             {false,
                 {capture,
                     element(2, re2:compile(WillowLeedsDoubleExtractor)),
-                    [<<"dob">>, <<"dod">>],
+                    [dob, dod],
                     FilterFun2}}
         },
     {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
@@ -989,7 +990,7 @@ capture_and_filter_terms(_Config) ->
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
             {true,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+                {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}}
         },
     {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
     BornMid70sRE2_5 =
@@ -1001,18 +1002,82 @@ capture_and_filter_terms(_Config) ->
             end,
             RunnerRE2_5()),

+    SW6 = os:timestamp(),
+
+    ComparatorExpression3 =
+        "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\") andalso (\"$dod\" > \"19000101\" orelse \"$dod\" < \"20501231\").",
+
+    FilterFun3 =
+        leveled_filter:validate_comparison_expression(
+            ComparatorExpression3, ["dob", "dod"]),
+    QueryRE2_6 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
+                    [dob, dod],
+                    FilterFun3}}
+        },
+    {async, RunnerRE2_6} = leveled_bookie:book_returnfolder(Book1, QueryRE2_6),
+    BornMid70sRE2_6 = RunnerRE2_6(),
+
+    SW7 = os:timestamp(),
+
+    ComparatorExpression4 =
+        "\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\".",
+
+    FilterFun4 =
+        leveled_filter:validate_comparison_expression(
+            ComparatorExpression4, ["dob"]),
+    QueryRE2_7 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(WillowLeedsExtractor)),
+                    [dob],
+                    FilterFun4}}
+        },
+    {async, RunnerRE2_7} = leveled_bookie:book_returnfolder(Book1, QueryRE2_7),
+    BornMid70sRE2_7 = RunnerRE2_7(),
+
+    SW8 = os:timestamp(),
+
+    PreFilterRE =
+        "[^\\|]*\\|(?P<dob>197[4-6]{1}[0-9]{4})\\|"
+        "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
+        "[^\\|]*#LS[^\\|]*",
+    QueryRE2_8 =
+        {index_query,
+            {Bucket, null},
+            {fun testutil:foldkeysfun/3, []},
+            {IdxName, <<"M">>, <<"Z">>},
+            {false,
+                {capture,
+                    element(2, re2:compile(PreFilterRE)),
+                    [dob],
+                    FilterFun4}}
+        },
+    {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8),
+    BornMid70sRE2_8 = RunnerRE2_8(),
+
+    SW9 = os:timestamp(),
+
     true = length(BornMid70s0) > 0,
-    true = length(BornMid70sPCRE1) > 0,
-    true = length(BornMid70sRE2_2) > 0,
-    true = length(BornMid70sRE2_3) > 0,
-    true = length(BornMid70sRE2_4) > 0,
-    true = length(BornMid70sRE2_5) > 0,

     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sPCRE1),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_2),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4),
     true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_6),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_7),
+    true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_8),

     io:format(
         % user,
@@ -1034,6 +1099,18 @@ capture_and_filter_terms(_Config) ->
         % user,
         "~nRE2 double-capture filter outside took ~w ms~n",
         [timer:now_diff(SW5, SW4) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 double-capture filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW7, SW6) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 single-capture filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW8, SW7) div 1000]),
+    io:format(
+        % user,
+        "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n",
+        [timer:now_diff(SW9, SW8) div 1000]),

     ok = leveled_bookie:book_close(Book1),

From 2b61a1372918085ef12c573959e256e6899752bb Mon Sep 17 00:00:00 2001
From: Martin Sumner
Date: Sat, 27 Apr 2024 22:15:17 +0100
Subject: [PATCH 4/8] Add Filter Expression support

Allow for capture with either delimited index terms, or with regex capture,
and then filtering of captured attributes using a logical expression.

The xref check `locals_not_used` is now ignored due to issues with
auto-generated modules.
---
 .gitignore                         |   2 +
 rebar.config                       |   5 +-
 src/leveled_codec.erl              |  67 +++--
 src/leveled_filter.erl             | 436 ++++++++++++++++++++---------
 src/leveled_filterlexer.xrl        |  49 ++++
 src/leveled_filterparser.yrl       |  50 ++++
 test/end_to_end/iterator_SUITE.erl | 413 +++++++++++++--------------
 test/end_to_end/perf_SUITE.erl     |  61 +++-
 8 files changed, 703 insertions(+), 380 deletions(-)
 create mode 100644 src/leveled_filterlexer.xrl
 create mode 100644 src/leveled_filterparser.yrl

diff --git a/.gitignore b/.gitignore
index 3906bbcd..2dc6c0c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,5 @@ cover_*
 .eqc-info
 leveled_data/*
 compile_commands.json
+*parser.erl
+*lexer.erl

diff --git a/rebar.config b/rebar.config
index e73ff4b7..f2b6ff10 100644
--- a/rebar.config
+++ b/rebar.config
@@ -4,11 +4,11 @@
 {xref_checks,
     [undefined_function_calls,undefined_functions,
-        locals_not_used,
        deprecated_function_calls, deprecated_functions]}.
{cover_excl_mods, - [testutil, + [leveled_filterlexer, leveled_filterparser, + testutil, appdefined_SUITE, basic_SUITE, iterator_SUITE, perf_SUITE, recovery_SUITE, riak_SUITE, tictac_SUITE]}. @@ -23,6 +23,7 @@ ]}, {test, [{extra_src_dirs, ["test/end_to_end", "test/property"]} ]}, + {perf_fexp, [{erl_opts, [{d, test_filter_expression}]}]}, {perf_full, [{erl_opts, [{d, performance, riak_fullperf}]}]}, {perf_mini, [{erl_opts, [{d, performance, riak_miniperf}]}]}, {perf_prof, [{erl_opts, [{d, performance, riak_profileperf}]}]} diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 7426fb23..9270668e 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -116,7 +116,8 @@ reference()|{re_pattern, term(), term(), term(), term()}. -type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). -type capture_reference() :: - {capture, actual_regex(), list(atom()), capture_filter_fun()}. + {capture, actual_regex(), list(atom()|binary()), capture_filter_fun()}| + {delimited, string(), list(binary()), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -313,26 +314,39 @@ accumulate_index( Opts = [{capture, all_but_first, binary}], fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, Opts) of - {match, CptTerms} when length(CptTerms) == ExpectedKeys -> - CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), - case FilterFun(CptMap) of - true -> - case AddTerm of - true -> - FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc); - false -> - FoldKeysFun(Bucket, ObjKey, Acc); - CptKey when is_atom(CptKey) -> - CptValue = maps:get(CptKey, CptMap), - FoldKeysFun(Bucket, {CptValue, ObjKey}, Acc) - end; - false -> - Acc - end; + {match, CptTerms} -> + L = min(length(CptTerms), ExpectedKeys), + check_captured_terms( + lists:sublist(CptTerms, L), + lists:sublist(CptKeys, L), + FilterFun, + AddTerm, + FoldKeysFun, + Bucket, + IdxValue, + ObjKey, + Acc); _ -> Acc end end; +accumulate_index( + {AddTerm, {delimited, Delim, CptKeys, FilterFun}}, FoldKeysFun) -> + ExpectedKeys = length(CptKeys), + fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> + CptTerms = string:split(IdxValue, Delim, all), + L = min(length(CptTerms), ExpectedKeys), + check_captured_terms( + lists:sublist(CptTerms, L), + lists:sublist(CptKeys, L), + FilterFun, + AddTerm, + FoldKeysFun, + Bucket, + IdxValue, + ObjKey, + Acc) + end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> case leveled_util:regex_run(IdxValue, TermRegex, []) of @@ -348,6 +362,25 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end end. +check_captured_terms( + CptTerms, CptKeys, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> + CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + case FilterFun(CptMap) of + true -> + case AddTerm of + true -> + FoldKeysFun(B, {IdxValue, ObjKey}, Acc); + false -> + FoldKeysFun(B, ObjKey, Acc); + CptKey when is_atom(CptKey) -> + CptValue = maps:get(CptKey, CptMap), + FoldKeysFun(B, {CptValue, ObjKey}, Acc) + end; + false -> + Acc + end. + + -spec key_dominates(ledger_kv(), ledger_kv()) -> boolean(). %% @doc diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index a273b128..43fb1573 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -5,93 +5,135 @@ -module(leveled_filter). --export([validate_comparison_expression/2]). 
+-export( + [ + generate_filter_expression/2, + apply_filter/2 + ]). %%%============================================================================ %%% External API %%%============================================================================ -%% @doc validate a comparison expression -%% A comparison expression allows for string comparisons to be made using -%% >, <, >=, =< with andalso, orelse as well as () to group different -%% expressions. -%% The output is a function which will be passed a Map where the Map may be the -%% result of reading some projected attributes on an index string. -%% To compare a string with a key in the map, the name of the key must begin -%% with a "$". -%% In creating the function, the potential keys must be known and passed as a -%% string in the list (without the leading $). --spec validate_comparison_expression( - string(), list(string())) -> - fun((map()) -> {value, boolean(), map()}) | - {error, string(), erl_anno:location()}. -validate_comparison_expression( - Expression, AvailableArgs) when is_list(Expression) -> - case erl_scan:string(Expression) of - {ok, Tokens, _EndLoc} -> - case vert_compexpr_tokens(Tokens, AvailableArgs, []) of - {ok, UpdTokens} -> - case erl_parse:parse_exprs(UpdTokens) of - {ok, [Form]} -> - fun(LookupMap) -> - element(2, erl_eval:expr(Form, LookupMap)) - end; - {error, ErrorInfo} -> - {error, ErrorInfo, undefined} - end; - {error, Error, ErrorLocation} -> - {error, lists:flatten(Error), ErrorLocation} + +apply_filter({condition, Condition}, AttrMap) -> + apply_filter(Condition, AttrMap); +apply_filter({'OR', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) orelse apply_filter(P2, AttrMap); +apply_filter({'AND', P1, P2}, AttrMap) -> + apply_filter(P1, AttrMap) andalso apply_filter(P2, AttrMap); +apply_filter({'NOT', P1}, AttrMap) -> + not apply_filter(P1, AttrMap); +apply_filter({'BETWEEN', {identifier, _, ID}, CompA, CompB}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(3, CompA), element(3, CompB)} of + {Low, High} when Low =< High -> + V >= Low andalso V =< High; + {High, Low} -> + V >= Low andalso V =< High + end + end; +apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + lists:member(V, lists:map(fun(C) -> element(3, C) end, CheckList)) + end; +apply_filter({{comparator, Cmp, _}, {identifier, _ , ID}, CmpA}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + notfound -> + false; + V -> + case {element(1, CmpA), V} of + {string, V} when is_binary(V) -> + compare(Cmp, V, element(3, CmpA)); + {integer, V} when is_integer(V) -> + compare(Cmp, V, element(3, CmpA)); + _ -> + false + end + end; +apply_filter({contains, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:find(V, SubStr) of + nomatch -> + false; + _ -> + true end; - {error, Error, ErrorLocation} -> - {error, Error, ErrorLocation} + _ -> + false end; -validate_comparison_expression(_Expression, _AvailableArgs) -> - {error, "Expression not a valid string", undefined}. 
+apply_filter({begins_with, {identifier, _, ID}, {string, _ , SubStr}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + V when is_binary(V) -> + case string:prefix(V, SubStr) of + nomatch -> + false; + _ -> + true + end; + _ -> + false + end; +apply_filter({attribute_exists, {identifier, _, ID}}, AttrMap) -> + maps:is_key(ID, AttrMap); +apply_filter({attribute_not_exists, {identifier, _, ID}}, AttrMap) -> + not maps:is_key(ID, AttrMap); +apply_filter({attribute_empty, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + <<>> -> + true; + _ -> + false + end +. + +generate_filter_expression(FilterString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_filterlexer:string(FilterString), + case substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_filterparser:parse(UpdTokens) + end. %%%============================================================================ %%% Internal functions %%%============================================================================ -vert_compexpr_tokens([], _Args, ParsedTokens) -> - {ok, lists:reverse(ParsedTokens)}; -vert_compexpr_tokens([{dot, LN}], Args, ParsedTokens) -> - vert_compexpr_tokens([], Args, [{dot, LN}|ParsedTokens]); -vert_compexpr_tokens(L, _Args, _ParsedTokens) when length(L) == 1 -> - {error, "Query does not end with `.`", undefined}; -vert_compexpr_tokens([{string, LN, String}|Rest], Args, ParsedTokens) -> - case hd(String) of - 36 -> % 36 == $ - Arg = tl(String), - case lists:member(Arg, Args) of - true -> - VarToken = {var, LN, list_to_atom(Arg)}, - vert_compexpr_tokens(Rest, Args, [VarToken|ParsedTokens]); - false -> - {error, io_lib:format("Unavailable Arg ~s", [Arg]), LN} - end; - _ -> % Not a key, treat as a binary - BinString = [{'>>', LN}, {string, LN, String}, {'<<', LN}], - vert_compexpr_tokens( - Rest, Args, BinString ++ ParsedTokens) +compare('>', V, CmpA) -> V > CmpA; +compare('>=', V, CmpA) -> V >= CmpA; +compare('<', V, CmpA) -> V < CmpA; +compare('<=', V, CmpA) -> V =< CmpA; +compare('=', V, CmpA) -> V == CmpA; +compare('<>', V, CmpA) -> V =/= CmpA. 
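
Taken together, generate_filter_expression/2 and apply_filter/2 form the evaluation pipeline introduced by this patch; a minimal sketch (expression, substitution and attribute names illustrative). Identifiers are referenced with a `$` prefix, substitution parameters with `:`, and both identifiers and scanned string literals are carried as binaries:

    {ok, Filter} =
        leveled_filter:generate_filter_expression(
            "($dob BETWEEN \"19740301\" AND \"19761030\")"
            " AND begins_with($pc, :pc)",
            #{<<"pc">> => <<"LS">>}),
    true =
        leveled_filter:apply_filter(
            Filter,
            #{<<"dob">> => <<"19750601">>, <<"pc">> => <<"LS1 4BT">>}).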
+ +substitute_items([], _Subs, UpdTokens) -> + lists:reverse(UpdTokens); +substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> + case maps:get(ID, Subs, notfound) of + notfound -> + {error, + lists:flatten( + io_lib:format("Substitution ~p not found", [ID]))}; + Value when is_binary(Value) -> + substitute_items( + Rest, Subs, [{string, LN, binary_to_list(Value)}|UpdTokens]); + Value when is_integer(Value) -> + substitute_items(Rest, Subs, [{integer, LN, Value}|UpdTokens]); + _UnexpectedValue -> + {error, + lists:flatten( + io_lib:format("Substitution ~p unexpected type", [ID]))} end; -vert_compexpr_tokens([{'(',LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'(',LN}|ParsedTokens]); -vert_compexpr_tokens([{')',LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{')',LN}|ParsedTokens]); -vert_compexpr_tokens([{'andalso', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'andalso', LN}|ParsedTokens]); -vert_compexpr_tokens([{'orelse', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'orelse', LN}|ParsedTokens]); -vert_compexpr_tokens([{'>', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'>', LN}|ParsedTokens]); -vert_compexpr_tokens([{'<', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'<', LN}|ParsedTokens]); -vert_compexpr_tokens([{'>=', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'>=', LN}|ParsedTokens]); -vert_compexpr_tokens([{'=<', LN}|Rest], Args, ParsedTokens) -> - vert_compexpr_tokens(Rest, Args, [{'=<', LN}|ParsedTokens]); -vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> - {error, io_lib:format("Invalid token ~w", [InvalidToken]), undefined}. +substitute_items([Token|Rest], Subs, UpdTokens) -> + substitute_items(Rest, Subs, [Token|UpdTokens]). %%%============================================================================ @@ -102,74 +144,198 @@ vert_compexpr_tokens([InvalidToken|_Rest], _Args, _ParsedTokens) -> -include_lib("eunit/include/eunit.hrl"). -good_querystring_test() -> - QueryString = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\" andalso \"$dod\" < \"20230101\").", - AvailableArgs = ["dob", "dod"], - F = validate_comparison_expression(QueryString, AvailableArgs), - M1 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20221216">>}]), - M2 = maps:from_list([{dob, <<"19750202">>}, {dod, <<"20191216">>}]), - M3 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20221216">>}]), - M4 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20191216">>}]), - M5 = maps:from_list([{dob, <<"19790202">>}, {dod, <<"20241216">>}]), - ?assertMatch(true, F(M1)), - ?assertMatch(true, F(M2)), - ?assertMatch(true, F(M3)), - ?assertMatch(false, F(M4)), - ?assertMatch(false, F(M5)). 
- -bad_querystring_test() -> - QueryString = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString1 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\")", - BadString2 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\")).", - BadString3 = - "(\"$dob\" >= \"19740301\" and \"$dob\" =< \"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString4 = "os:cmd(foo)", - BadString5 = - [42,63,52,10,240,159,140,190] ++ - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< <<\"19761030\")" - " orelse (\"$dod\" > \"20200101\").", - BadString6 = [(16#110000 + 1)|QueryString], - BadString7 = <<42:32/integer>>, - ?assertMatch( - {error, "Unavailable Arg dod", 1}, - validate_comparison_expression(QueryString, ["dob", "pv"]) - ), +invalid_filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") OR (($b >= \"30\") AND contains($c, :d))", + SubsMissing = maps:from_list([{<<"a">>, <<"MA">>}]), ?assertMatch( - {error, "Query does not end with `.`", undefined}, - validate_comparison_expression(BadString1, ["dob", "dod"]) + {error, "Substitution <<\"d\">> not found"}, + generate_filter_expression(FE1, SubsMissing) ), + SubsWrongType = maps:from_list([{<<"d">>, "42"}]), ?assertMatch( - {error, {1 ,erl_parse, ["syntax error before: ","')'"]}, undefined}, - validate_comparison_expression(BadString2, ["dob", "dod"]) + {error, "Substitution <<\"d\">> unexpected type"}, + generate_filter_expression(FE1, SubsWrongType) ), + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + FE2 = "($a BETWEEN \"A\" AND 12) OR (($b >= \"30\") AND contains($c, :d))", ?assertMatch( - {error, "Invalid token {'and',1}", undefined}, - validate_comparison_expression(BadString3, ["dob", "dod"]) + {error, {1, leveled_filterparser,["syntax error before: ","12"]}}, + generate_filter_expression(FE2, SubsPresent) ), + SubsWrongTypeForContains = maps:from_list([{<<"d">>, 42}]), + FE4 = "($a BETWEEN 12 AND 12) OR (($b >= \"30\") AND contains($c, :d))", ?assertMatch( - {error, "Invalid token {atom,1,os}", undefined}, - validate_comparison_expression(BadString4, ["dob", "dod"]) - ), - ?assertMatch( - {error, "Invalid token {'*',1}", undefined}, - validate_comparison_expression(BadString5, ["dob", "dod"]) - ), - ?assertMatch( - {error, {1, erl_scan, {illegal,character}}, 1}, - validate_comparison_expression(BadString6, ["dob", "dod"]) - ), - ?assertMatch( - {error, "Expression not a valid string", undefined}, - validate_comparison_expression(BadString7, ["dob", "dod"]) + {error, {1, leveled_filterparser, ["syntax error before: ","42"]}}, + generate_filter_expression(FE4, SubsWrongTypeForContains) ). 
+filterexpression_test() -> + FE1 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) AND contains($c, :d))", + SubsPresent = maps:from_list([{<<"d">>, <<"MA">>}]), + {ok, Filter1} = generate_filter_expression(FE1, SubsPresent), + M1 = #{<<"a">> => <<"A11">>, <<"b">> => 100, <<"c">> => <<"CARTMAN">>}, + ?assert(apply_filter(Filter1, M1)), + % ok + + M2 = #{<<"a">> => <<"A11">>, <<"b">> => 10, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M2)), + % $b < 30 + + FE2 = "($a BETWEEN \"A\" AND \"A12\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter2} = generate_filter_expression(FE2, SubsPresent), + ?assert(apply_filter(Filter2, M2)), + % OR used so ($b >= 30) = false is ok + + FE3 = "($a BETWEEN \"A12\" AND \"A\") AND (($b >= 30) OR contains($c, :d))", + {ok, Filter3} = generate_filter_expression(FE3, SubsPresent), + ?assert(apply_filter(Filter3, M2)), + % swapping the low/high - still ok + + M3 = #{<<"a">> => <<"A11">>, <<"b">> => <<"100">>, <<"c">> => <<"CARTMAN">>}, + ?assertNot(apply_filter(Filter1, M3)), + % substitution b is not an integer + + FE4 = + "($dob BETWEEN \"19700101\" AND \"19791231\") " + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, Filter4} = generate_filter_expression(FE4, maps:new()), + M4 = + #{ + <<"dob">> => <<"19751124">>, + <<"gns">> => <<"#Mia#Willow#Chloe">>, + <<"pcs">> => <<"#BD1 1DU#LS1 4BT">> + }, + ?assert(apply_filter(Filter4, M4)), + + FE5 = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " OR ($dod > \"20200101\" AND $dod < \"20230101\")", + + {ok, Filter5} = generate_filter_expression(FE5, maps:new()), + F = fun(M) -> apply_filter(Filter5, M) end, + + M5 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20221216">>}]), + M6 = maps:from_list([{<<"dob">>, <<"19750202">>}, {<<"dod">>, <<"20191216">>}]), + M7 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20221216">>}]), + M8 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20191216">>}]), + M9 = maps:from_list([{<<"dob">>, <<"19790202">>}, {<<"dod">>, <<"20241216">>}]), + M10 = maps:new(), + ?assertMatch(true, F(M5)), + ?assertMatch(true, F(M6)), + ?assertMatch(true, F(M7)), + ?assertMatch(false, F(M8)), + ?assertMatch(false, F(M9)), + ?assertMatch(false, F(M10)), + + FE5A = + "($dob >= \"19740301\" AND $dob <= \"19761030\")" + " AND ($dod = \"20221216\")", + {ok, Filter5A} = generate_filter_expression(FE5A, maps:new()), + ?assert(apply_filter(Filter5A, M5)), + ?assertNot(apply_filter(Filter5A, M6)), + + FE6 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob <> \"19993112\"", + {ok, Filter6} = generate_filter_expression(FE6, maps:new()), + M11 = maps:from_list([{<<"dob">>, <<"19993112">>}]), + ?assertMatch(false, apply_filter(Filter6, M11)), + + FE7 = + "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob = \"19993112\"", + {ok, Filter7} = generate_filter_expression(FE7, maps:new()), + ?assert(apply_filter(Filter7, M11)), + + FE8 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", \"19910103\")", + {ok, Filter8} = generate_filter_expression(FE8, maps:new()), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter8, #{<<"dob">> => <<"19910103">>})), + ?assertNot(apply_filter(Filter8, #{<<"dob">> => <<"19910102">>})), + ?assertNot(apply_filter(Filter8, #{<<"gn">> => <<"Nikki">>})), + + FE9 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" + " OR $dob IN (\"19910301\", 19910103)", + % Only 
match with a type match + {ok, Filter9} = generate_filter_expression(FE9, maps:new()), + ?assert(apply_filter(Filter9, #{<<"dob">> => <<"19910301">>})), + ?assert(apply_filter(Filter9, #{<<"dob">> => 19910103})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => 19910301})), + ?assertNot(apply_filter(Filter9, #{<<"dob">> => <<"19910103">>})), + + FE10 = "NOT contains($gn, \"MA\") AND " + "(NOT $dob IN (\"19910301\", \"19910103\"))", + {ok, Filter10} = generate_filter_expression(FE10, maps:new()), + ?assert( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter10, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE11 = "NOT contains($gn, \"MA\") AND " + "NOT $dob IN (\"19910301\", \"19910103\")", + {ok, Filter11} = generate_filter_expression(FE11, maps:new()), + ?assert( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"EMMA">>, <<"dob">> => <<"19910201">>})), + ?assertNot( + apply_filter( + Filter11, + #{<<"gn">> => <<"JAMES">>, <<"dob">> => <<"19910301">>})), + + FE12 = "begins_with($gn, \"MA\") AND begins_with($fn, :fn)", + {ok, Filter12} = generate_filter_expression(FE12, #{<<"fn">> => <<"SU">>}), + ?assert( + apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => <<"MITTY">>, <<"fn">> => <<"SUMMER">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => <<"MATTY">>, <<"fn">> => <<"SIMMS">>})), + ?assertNot( + apply_filter( + Filter12, + #{<<"gn">> => 42, <<"fn">> => <<"SUMMER">>})), + + FE13 = "attribute_exists($dob) AND attribute_not_exists($consent) " + "AND attribute_empty($dod)", + {ok, Filter13} = generate_filter_expression(FE13, maps:new()), + ?assert( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, + <<"consent">> => <<>>, + <<"dod">> => <<>>})), + ?assertNot( + apply_filter( + Filter13, + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})) + . + -endif. diff --git a/src/leveled_filterlexer.xrl b/src/leveled_filterlexer.xrl new file mode 100644 index 00000000..9fea88ff --- /dev/null +++ b/src/leveled_filterlexer.xrl @@ -0,0 +1,49 @@ +%% Lexer for filter and conditional expressions +%% Author: Thomas Arts + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. + +, : {token, {',', TokenLine}}. +NOT : {token, {'NOT', TokenLine}}. +AND : {token, {'AND', TokenLine}}. +OR : {token, {'OR', TokenLine}}. +BETWEEN : {token, {'BETWEEN', TokenLine}}. +IN : {token, {'IN', TokenLine}}. += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. + +contains : {token, {'contains', TokenLine}}. +begins_with : {token, {'begins_with', TokenLine}}. +attribute_exists : {token, {'attribute_exists', TokenLine}}. +attribute_not_exists : {token, {'attribute_not_exists', TokenLine}}. 
+attribute_empty : {token, {'attribute_empty', TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_filterparser.yrl b/src/leveled_filterparser.yrl new file mode 100644 index 00000000..d9253590 --- /dev/null +++ b/src/leveled_filterparser.yrl @@ -0,0 +1,50 @@ +%% Grammar for filter expressions +%% Author: Thomas Arts + +Nonterminals +top_level condition operand operand_list operands. + + +Terminals +'(' ')' comparator identifier string integer +',' +'NOT' 'AND' 'OR' 'IN' 'BETWEEN' +'contains' 'begins_with' +'attribute_exists' 'attribute_not_exists' 'attribute_empty'. + + +Rootsymbol top_level. + +top_level -> condition: {condition, '$1'}. + +condition -> operand comparator operand : {'$2', '$1', '$3'}. +condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'BETWEEN' integer 'AND' integer : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'IN' operand_list : {'IN', '$1', '$3'}. +condition -> condition 'AND' condition : {'AND', '$1', '$3'}. +condition -> condition 'OR' condition : {'OR', '$1', '$3'}. +condition -> 'NOT' condition : {'NOT', '$2'}. +condition -> 'contains' '(' identifier ',' string ')' : {'contains', '$3', '$5'}. +condition -> 'begins_with' '(' identifier ',' string ')' : {'begins_with', '$3', '$5'}. +condition -> 'attribute_exists' '(' identifier ')' : {'attribute_exists', '$3'}. +condition -> 'attribute_not_exists' '(' identifier ')' : {'attribute_not_exists', '$3'}. +condition -> 'attribute_empty' '(' identifier ')' : {'attribute_empty', '$3'}. +condition -> '(' condition ')' : '$2'. + +operand -> identifier : '$1'. +operand -> string : '$1'. +operand -> integer : '$1'. + +operand_list -> '(' operands ')' : '$2'. + +operands -> operand ',' operands : ['$1' | '$3']. +operands -> operand : ['$1']. + +Endsymbol '$end'. + +Right 200 'NOT'. +Nonassoc 200 comparator. +Left 150 'AND'. +Left 100 'OR'. + +Erlang code. diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 0cf0ebb5..955c036e 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -17,18 +17,17 @@ ]). all() -> [ - % expiring_indexes, - % breaking_folds, - % single_object_with2i, - % small_load_with2i, - % query_count, - % multibucket_fold, - % rotating_objects, - % foldobjects_bybucket_range, + expiring_indexes, + breaking_folds, + single_object_with2i, + small_load_with2i, + query_count, + multibucket_fold, + rotating_objects, + foldobjects_bybucket_range, capture_and_filter_terms ]. - expiring_indexes(_Config) -> % Add objects to the store with index entries, where the objects (and hence % the indexes have an expiry time. 
Confirm that the indexes and the @@ -50,11 +49,11 @@ expiring_indexes(_Config) -> SW1 = os:timestamp(), timer:sleep(1000), - V9 = testutil:get_compressiblevalue(), + V1 = <<"V1">>, Indexes9 = testutil:get_randomindexes_generator(2), TempRiakObjects = testutil:generate_objects( - KeyCount, binary_uuid, [], V9, Indexes9, "riakBucket"), + KeyCount, binary_uuid, [], V1, Indexes9, "riakBucket"), IBKL1 = testutil:stdload_expiring(Bookie1, KeyCount, Future), lists:foreach( @@ -144,11 +143,6 @@ expiring_indexes(_Config) -> Bookie1, B0, K0, 5, <<"value">>, leveled_util:integer_now() + 10), timer:sleep(1000), {async, Folder2} = IndexFold(), - leveled_bookie:book_indexfold(Bookie1, - B0, - {FoldFun, InitAcc}, - {<<"temp_int">>, 5, 8}, - {true, undefined}), QR2 = Folder2(), io:format("Query with additional entry length ~w~n", [length(QR2)]), true = lists:sort(QR2) == lists:sort([{5, B0, K0}|LoadedEntriesInRange]), @@ -203,13 +197,11 @@ breaking_folds(_Config) -> {max_journalsize, 10000000}, {sync_strategy, testutil:sync_strategy()}], {ok, Bookie1} = leveled_bookie:book_start(StartOpts1), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + V1 = testutil:get_compressiblevalue_andinteger(), IndexGen = testutil:get_randomindexes_generator(8), - ObjL1 = testutil:generate_objects(KeyCount, - binary_uuid, - [], - ObjectGen, - IndexGen), + ObjL1 = + testutil:generate_objects( + KeyCount, binary_uuid, [], V1, IndexGen), testutil:riakload(Bookie1, ObjL1), % Find all keys index, and then same again but stop at a midpoint using a @@ -284,10 +276,11 @@ breaking_folds(_Config) -> end end, {async, HeadFolderToMidK} = - leveled_bookie:book_headfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(HeadFoldFun), []}, - true, true, false), + leveled_bookie:book_headfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(HeadFoldFun), []}, + true, true, false), KeySizeList2 = lists:reverse(CatchingFold(HeadFolderToMidK)), io:format("Head fold with result size ~w~n", [length(KeySizeList2)]), true = KeyCount div 2 == length(KeySizeList2), @@ -297,21 +290,19 @@ breaking_folds(_Config) -> [{K,byte_size(V)}|Acc] end, {async, ObjectFolderKO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, key_order), ObjSizeList1 = lists:reverse(ObjectFolderKO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1)]), true = KeyCount == length(ObjSizeList1), {async, ObjFolderToMidK} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowFun(ObjFoldFun), []}, - false, - key_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowFun(ObjFoldFun), []}, + false, + key_order), ObjSizeList2 = lists:reverse(CatchingFold(ObjFolderToMidK)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2)]), true = KeyCount div 2 == length(ObjSizeList2), @@ -321,11 +312,8 @@ breaking_folds(_Config) -> % that was terminated by reaching a point in the key range .. 
as results % will not be passed to the fold function in key order {async, ObjectFolderSO} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {ObjFoldFun, []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {ObjFoldFun, []}, false, sqn_order), ObjSizeList1_SO = lists:reverse(ObjectFolderSO()), io:format("Obj fold with result size ~w~n", [length(ObjSizeList1_SO)]), true = KeyCount == length(ObjSizeList1_SO), @@ -343,33 +331,26 @@ breaking_folds(_Config) -> end end, {async, ObjFolderTo1K} = - leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldThrowThousandFun(ObjFoldFun), []}, - false, - sqn_order), + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + {FoldThrowThousandFun(ObjFoldFun), []}, + false, + sqn_order), ObjSizeList2_SO = lists:reverse(CatchingFold(ObjFolderTo1K)), io:format("Object fold with result size ~w~n", [length(ObjSizeList2_SO)]), true = 1000 == length(ObjSizeList2_SO), - ObjL2 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B2"), - ObjL3 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B3"), - ObjL4 = testutil:generate_objects(10, - binary_uuid, - [], - ObjectGen, - IndexGen, - "B4"), + ObjectGen = testutil:get_compressiblevalue_andinteger(), + ObjL2 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B2"), + ObjL3 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B3"), + ObjL4 = + testutil:generate_objects( + 10, binary_uuid, [], ObjectGen, IndexGen, "B4"), testutil:riakload(Bookie1, ObjL2), testutil:riakload(Bookie1, ObjL3), testutil:riakload(Bookie1, ObjL4), @@ -393,15 +374,12 @@ breaking_folds(_Config) -> end, {async, StopAt3BucketFolder} = - leveled_bookie:book_bucketlist(Bookie1, - ?RIAK_TAG, - {StopAt3Fun, []}, - all), + leveled_bookie:book_bucketlist( + Bookie1, ?RIAK_TAG, {StopAt3Fun, []}, all), BucketListSA3 = lists:reverse(CatchingFold(StopAt3BucketFolder)), io:format("bucket list with result ~w~n", [BucketListSA3]), true = [<<"B2">>, <<"B3">>] == BucketListSA3, - ok = leveled_bookie:book_close(Bookie1), testutil:reset_filestructure(). 
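The folds above all stop early through the same idiom: the wrapped fold fun
throws once it has accumulated enough, and the caller catches around the
async runner. CatchingFold is used throughout this test but defined outside
this excerpt; a plausible definition is sketched here, assuming an
illustrative {stop_fold, Acc} throw term rather than the suite's actual one:

    CatchingFold =
        fun(AsyncFolder) ->
            try
                AsyncFolder()
            catch
                throw:{stop_fold, Acc} ->
                    Acc
            end
        end,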
@@ -503,32 +481,29 @@ small_load_with2i(_Config) -> true = 1 == length(KeyList2), %% Delete the objects from the ChkList removing the indexes - lists:foreach(fun({_RN, Obj, Spc}) -> - DSpc = lists:map(fun({add, F, T}) -> - {remove, F, T} - end, - Spc), - {B, K} = - {testutil:get_bucket(Obj), testutil:get_key(Obj)}, - testutil:book_riakdelete(Bookie1, B, K, DSpc) - end, - ChkList1), + lists:foreach( + fun({_RN, Obj, Spc}) -> + DSpc = lists:map(fun({add, F, T}) -> + {remove, F, T} + end, + Spc), + {B, K} = + {testutil:get_bucket(Obj), testutil:get_key(Obj)}, + testutil:book_riakdelete(Bookie1, B, K, DSpc) + end, + ChkList1), %% Get the Buckets Keys and Hashes for the whole bucket - FoldObjectsFun = fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] - end, + FoldObjectsFun = + fun(B, K, V, Acc) -> [{B, K, erlang:phash2(V)}|Acc] end, - {async, HTreeF1} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - {FoldObjectsFun, []}, - false), + {async, HTreeF1} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, {FoldObjectsFun, []}, false), KeyHashList1 = HTreeF1(), - {async, HTreeF2} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - "Bucket", - all, - {FoldObjectsFun, []}, - false), + {async, HTreeF2} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, "Bucket", all, {FoldObjectsFun, []}, false), KeyHashList2 = HTreeF2(), {async, HTreeF3} = leveled_bookie:book_objectfold( @@ -543,10 +518,11 @@ small_load_with2i(_Config) -> true = 9900 == length(KeyHashList2), true = 9900 == length(KeyHashList3), - SumIntFun = fun(_B, _K, Obj, Acc) -> - {I, _Bin} = testutil:get_value(Obj), - Acc + I - end, + SumIntFun = + fun(_B, _K, Obj, Acc) -> + {I, _Bin} = testutil:get_value(Obj), + Acc + I + end, BucketObjQ = {foldobjects_bybucket, ?RIAK_TAG, "Bucket", all, {SumIntFun, 0}, true}, {async, Sum1} = leveled_bookie:book_returnfolder(Bookie1, BucketObjQ), @@ -561,8 +537,9 @@ small_load_with2i(_Config) -> lists:foldl(SumFromObjLFun, 0, ObjL1), ChkList1Total = lists:foldl(SumFromObjLFun, 0, ChkList1), - io:format("Total in original object list ~w and from removed list ~w~n", - [ObjL1Total, ChkList1Total]), + io:format( + "Total in original object list ~w and from removed list ~w~n", + [ObjL1Total, ChkList1Total]), Total1 = ObjL1Total - ChkList1Total, @@ -600,7 +577,7 @@ query_count(_Config) -> testutil:check_forobject(Book1, TestObject), lists:foreach( fun(_X) -> - V = testutil:get_compressiblevalue(), + V = <<"TestValue">>, Indexes = testutil:get_randomindexes_generator(8), SW = os:timestamp(), ObjL1 = @@ -826,10 +803,9 @@ query_count(_Config) -> ok = leveled_bookie:book_close(Book4), - {ok, Book5} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Book5} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), {async, BLF3} = leveled_bookie:book_returnfolder(Book5, BucketListQuery), SW_QC = os:timestamp(), BucketSet3 = BLF3(), @@ -849,14 +825,14 @@ capture_and_filter_terms(_Config) -> {ok, Book1} = leveled_bookie:book_start( RootPath, 2000, 50000000, testutil:sync_strategy()), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + V1 = <<"V1">>, IndexGen = fun() -> [{add, IdxName, list_to_binary(perf_SUITE:random_people_index())}] end, ObjL1 = testutil:generate_objects( - 80000, uuid, [], ObjectGen, IndexGen, Bucket), + 100000, uuid, [], V1, IndexGen, Bucket), testutil:riakload(Book1, ObjL1), StartDoB = <<"19740301">>, @@ -1002,71 +978,83 @@ capture_and_filter_terms(_Config) -> end, RunnerRE2_5()), - SW6 = 
os:timestamp(), - - ComparatorExpression3 = - "(\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\") andalso (\"$dod\" > \"19000101\" orelse \"$dod\" < \"20501231\").", + SW8 = os:timestamp(), + + FilterExpression1 = "($dob BETWEEN \"19740301\" AND \"19761030\")", + {ok, FilterExpressionParsed1} = + leveled_filter:generate_filter_expression( + FilterExpression1, maps:new()), + FilterFun5 = + fun(AttrMap) -> + leveled_filter:apply_filter(FilterExpressionParsed1, AttrMap) + end, - FilterFun3 = - leveled_filter:validate_comparison_expression( - ComparatorExpression3, ["dob", "dod"]), - QueryRE2_6 = + QueryRE2_8 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, {capture, - element(2, re2:compile(WillowLeedsDoubleExtractor)), - [dob, dod], - FilterFun3}} + element(2, re2:compile(WillowLeedsExtractor)), + [<<"dob">>], + FilterFun5}} }, - {async, RunnerRE2_6} = leveled_bookie:book_returnfolder(Book1, QueryRE2_6), - BornMid70sRE2_6 = RunnerRE2_6(), + {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8), + BornMid70sRE2_8 = RunnerRE2_8(), - SW7 = os:timestamp(), - - ComparatorExpression4 = - "\"$dob\" >= \"19740301\" andalso \"$dob\" =< \"19761030\".", - - FilterFun4 = - leveled_filter:validate_comparison_expression( - ComparatorExpression4, ["dob"]), - QueryRE2_7 = + SW9 = os:timestamp(), + + PreFilterRE = + "[^\\|]*\\|(?P197[4-6]{1}[0-9]{4})\\|" + "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + "[^\\|]*#LS[^\\|]*", + QueryRE2_9 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, {capture, - element(2, re2:compile(WillowLeedsExtractor)), - [dob], - FilterFun4}} + element(2, re2:compile(PreFilterRE)), + [<<"dob">>], + FilterFun5}} }, - {async, RunnerRE2_7} = leveled_bookie:book_returnfolder(Book1, QueryRE2_7), - BornMid70sRE2_7 = RunnerRE2_7(), + {async, RunnerRE2_9} = leveled_bookie:book_returnfolder(Book1, QueryRE2_9), + BornMid70sRE2_9 = RunnerRE2_9(), - SW8 = os:timestamp(), + SW10 = os:timestamp(), - PreFilterRE = - "[^\\|]*\\|(?P197[4-6]{1}[0-9]{4})\\|" - "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" + WillowLeedsExtractor = + "[^\\|]*\\|(?P[0-9]{8})\\|[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|" "[^\\|]*#LS[^\\|]*", - QueryRE2_8 = + + FilterExpression2 = + "($dob BETWEEN \"19740301\" AND \"19761030\")" + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, FilterExpressionParsed2} = + leveled_filter:generate_filter_expression( + FilterExpression2, maps:new()), + FilterFun6 = + fun(AttrMap) -> + leveled_filter:apply_filter(FilterExpressionParsed2, AttrMap) + end, + + QueryRE2_10 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, - element(2, re2:compile(PreFilterRE)), - [dob], - FilterFun4}} + {delimited, + "|", + [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + FilterFun6}} }, - {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8), - BornMid70sRE2_8 = RunnerRE2_8(), + {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10), + BornMid70sRE2_10 = RunnerRE2_10(), - SW9 = os:timestamp(), + SW11 = os:timestamp(), true = length(BornMid70s0) > 0, @@ -1075,47 +1063,46 @@ capture_and_filter_terms(_Config) -> true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_3), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_4), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_5), - true = lists:sort(BornMid70s0) == 
lists:sort(BornMid70sRE2_6), - true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_7), true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_8), + true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_9), + true = lists:sort(BornMid70s0) == lists:sort(BornMid70sRE2_10), - io:format( - % user, + maybe_log_toscreen( "~nFilter outside took ~w ms~n", [timer:now_diff(SW1, SW0) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nPCRE Capture filter inside took ~w ms~n", [timer:now_diff(SW2, SW1) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 Capture filter inside took ~w ms~n", [timer:now_diff(SW3, SW2) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 Capture filter outside took ~w ms~n", [timer:now_diff(SW4, SW3) div 1000]), - io:format( - % user, + maybe_log_toscreen( "~nRE2 double-capture filter outside took ~w ms~n", [timer:now_diff(SW5, SW4) div 1000]), - io:format( - % user, - "~nRE2 double-capture filter with parsed query string took ~w ms~n", - [timer:now_diff(SW7, SW6) div 1000]), - io:format( - % user, - "~nRE2 single-capture filter with parsed query string took ~w ms~n", - [timer:now_diff(SW8, SW7) div 1000]), - io:format( - % user, - "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", + maybe_log_toscreen( + "~nRE2 single-capture filter with parsed filter expression took ~w ms~n", [timer:now_diff(SW9, SW8) div 1000]), + maybe_log_toscreen( + "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", + [timer:now_diff(SW10, SW9) div 1000]), + maybe_log_toscreen( + "~nDelimited index with parsed filter expression took ~w ms~n", + [timer:now_diff(SW11, SW10) div 1000]), ok = leveled_bookie:book_close(Book1), testutil:reset_filestructure(). +maybe_log_toscreen(Log, Subs) -> + io:format( + % user, + Log, + Subs + ). 
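+
+%% A summary sketch, reusing the names bound above: the delimited form in
+%% QueryRE2_10 needs no regex at all -- the runner splits each index term on
+%% the delimiter, zips the parts with the listed keys, and hands the filter
+%% fun an attribute map:
+%%
+%%   {false,
+%%       {delimited, "|",
+%%           [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>],
+%%           FilterFun6}}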
+ count_termsonindex(Bucket, IdxField, Book, QType) -> lists:foldl( @@ -1142,39 +1129,30 @@ count_termsonindex(Bucket, IdxField, Book, QType) -> multibucket_fold(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), - ObjectGen = testutil:get_compressiblevalue_andinteger(), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), + ObjectGen = <<"V1">>, IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(13000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type1">>, <<"Bucket1">>}), + B1 = {<<"Type1">>, <<"Bucket1">>}, + B2 = <<"Bucket2">>, + B3 = <<"Bucket3">>, + B4 = {<<"Type2">>, <<"Bucket4">>}, + ObjL1 = + testutil:generate_objects( + 13000, uuid, [], ObjectGen, IndexGen, B1), testutil:riakload(Bookie1, ObjL1), - ObjL2 = testutil:generate_objects(17000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket2">>), + ObjL2 = + testutil:generate_objects( + 17000, uuid, [], ObjectGen, IndexGen, B2), testutil:riakload(Bookie1, ObjL2), - ObjL3 = testutil:generate_objects(7000, - uuid, - [], - ObjectGen, - IndexGen, - <<"Bucket3">>), + ObjL3 = + testutil:generate_objects( + 7000, uuid, [], ObjectGen, IndexGen, B3), testutil:riakload(Bookie1, ObjL3), - ObjL4 = testutil:generate_objects(23000, - uuid, - [], - ObjectGen, - IndexGen, - {<<"Type2">>, <<"Bucket4">>}), + ObjL4 = + testutil:generate_objects( + 23000, uuid, [], ObjectGen, IndexGen, B4), testutil:riakload(Bookie1, ObjL4), FF = fun(B, K, _PO, Acc) -> @@ -1241,44 +1219,37 @@ rotating_objects(_Config) -> foldobjects_bybucket_range(_Config) -> RootPath = testutil:reset_filestructure(), - {ok, Bookie1} = leveled_bookie:book_start(RootPath, - 2000, - 50000000, - testutil:sync_strategy()), + {ok, Bookie1} = + leveled_bookie:book_start( + RootPath, 2000, 50000000, testutil:sync_strategy()), ObjectGen = testutil:get_compressiblevalue_andinteger(), IndexGen = fun() -> [] end, - ObjL1 = testutil:generate_objects(1300, - {fixed_binary, 1}, - [], - ObjectGen, - IndexGen, - <<"Bucket1">>), + ObjL1 = + testutil:generate_objects( + 1300, {fixed_binary, 1}, [], ObjectGen, IndexGen, <<"Bucket1">>), testutil:riakload(Bookie1, ObjL1), - FoldKeysFun = fun(_B, K,_V, Acc) -> - [ K |Acc] - end, + FoldKeysFun = + fun(_B, K,_V, Acc) -> [ K |Acc] end, StartKey = testutil:fixed_bin_key(123), EndKey = testutil:fixed_bin_key(779), - {async, Folder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - {StartKey, EndKey}, {FoldKeysFun, []}, - true - ), + {async, Folder} = + leveled_bookie:book_objectfold( + Bookie1, + ?RIAK_TAG, + <<"Bucket1">>, + {StartKey, EndKey}, {FoldKeysFun, []}, + true + ), ResLen = length(Folder()), io:format("Length of Result of folder ~w~n", [ResLen]), true = 657 == ResLen, - {async, AllFolder} = leveled_bookie:book_objectfold(Bookie1, - ?RIAK_TAG, - <<"Bucket1">>, - all, - {FoldKeysFun, []}, - true - ), + {async, AllFolder} = + leveled_bookie:book_objectfold( + Bookie1, ?RIAK_TAG, <<"Bucket1">>, all, {FoldKeysFun, []}, true), AllResLen = length(AllFolder()), io:format("Length of Result of all keys folder ~w~n", [AllResLen]), diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl index 27549f54..a4606292 100644 --- a/test/end_to_end/perf_SUITE.erl +++ b/test/end_to_end/perf_SUITE.erl @@ -11,6 +11,12 @@ -define(MINI_QUERY_DIVISOR, 8). -define(RGEX_QUERY_DIVISOR, 32). +-ifdef(test_filter_expression). + -define(TEST_FE, true). +-else. 
+ -define(TEST_FE, false). +-endif. + -ifndef(performance). -define(performance, riak_ctperf). -endif. @@ -564,8 +570,53 @@ random_queries(Bookie, Bucket, IDs, IdxCnt, MaxRange, IndexesReturned) -> ), TC div 1000. - random_people_queries(Bookie, Bucket, IndexesReturned) -> + random_people_queries(?TEST_FE, Bookie, Bucket, IndexesReturned). + +random_people_queries(true, Bookie, Bucket, IndexesReturned) -> + FilterExpression = + "($dob BETWEEN \"19700101\" AND \"19791231\") " + "AND (contains($gns, \"#Willow\") AND contains($pcs, \"#LS\"))", + {ok, ParsedFilter} = + leveled_filter:generate_filter_expression( + FilterExpression, maps:new()), + FilterFun = + fun(AttrMap) -> leveled_filter:apply_filter(ParsedFilter, AttrMap) end, + QueryFun = + fun() -> + Surname = get_random_surname(), + Range = + {?PEOPLE_INDEX, + Surname, + <> + }, + FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end, + {async, R} = + leveled_bookie:book_indexfold( + Bookie, + {Bucket, <<>>}, + {FoldKeysFun, 0}, + Range, + {true, + {delimited, + "|", + [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + FilterFun + } + }), + R() + end, + + {TC, {QC, EF}} = + timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end), + ct:log( + ?INFO, + "Fetch of ~w index entries by regex in ~w queries took ~w ms" + " with filter_expression=~w", + [EF, QC, TC div 1000, true] + ), + TC div 1000; +random_people_queries(false, Bookie, Bucket, IndexesReturned) -> SeventiesWillowRegex = "[^\\|]*\\|197[0-9]{5}\\|[^\\|]*\\|" "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*", @@ -578,8 +629,7 @@ random_people_queries(Bookie, Bucket, IndexesReturned) -> Surname, <> }, - {ok, TermRegex} = - leveled_util:regex_compile(SeventiesWillowRegex), + {ok, TermRegex} = leveled_util:regex_compile(SeventiesWillowRegex), FoldKeysFun = fun(_B, _K, Cnt) -> Cnt + 1 end, {async, R} = leveled_bookie:book_indexfold( @@ -595,8 +645,9 @@ random_people_queries(Bookie, Bucket, IndexesReturned) -> timer:tc(fun() -> run_queries(QueryFun, 0, 0, IndexesReturned) end), ct:log( ?INFO, - "Fetch of ~w index entries by regex in ~w queries took ~w ms", - [EF, QC, TC div 1000] + "Fetch of ~w index entries by regex in ~w queries took ~w ms" + " with filter_expression=~w", + [EF, QC, TC div 1000, false] ), TC div 1000. From bb7c88942c5cfad520b9e011611e1c5afbe5a32f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Mon, 29 Apr 2024 21:10:07 +0100 Subject: [PATCH 5/8] Initial lexer/parser for eval pipeline Allow for the generation of projected attributes from an index term --- rebar.config | 2 +- src/leveled_eval.erl | 240 +++++++++++++++++++++++++++++++++++ src/leveled_evallexer.xrl | 33 +++++ src/leveled_evalparser.yrl | 39 ++++++ src/leveled_filter.erl | 32 ++++- src/leveled_filterlexer.xrl | 10 +- src/leveled_filterparser.yrl | 17 +-- 7 files changed, 357 insertions(+), 16 deletions(-) create mode 100644 src/leveled_eval.erl create mode 100644 src/leveled_evallexer.xrl create mode 100644 src/leveled_evalparser.yrl diff --git a/rebar.config b/rebar.config index f2b6ff10..e76eb20d 100644 --- a/rebar.config +++ b/rebar.config @@ -7,7 +7,7 @@ deprecated_function_calls, deprecated_functions]}. {cover_excl_mods, - [leveled_filterlexer, leveled_filterparser, + [leveled_filterlexer, leveled_filterparser, leveled_evallexer, leveled_evalparser, testutil, appdefined_SUITE, basic_SUITE, iterator_SUITE, perf_SUITE, recovery_SUITE, riak_SUITE, tictac_SUITE]}. 
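The leveled_eval module added below is the heart of this commit: an eval
expression is lexed and parsed once, and the parsed form is then applied per
index entry to project attributes from the index term (and key) into a map.
A minimal sketch of the intended flow, using only functions defined in this
patch (the input term and key are invented for illustration):

    {ok, Tokens, _EndLine} =
        leveled_evallexer:string("delim($term, \"|\", ($fn, $dob))"),
    {ok, ParsedEval} = leveled_evalparser:parse(Tokens),
    AttrMap =
        leveled_eval:apply_eval(
            ParsedEval, <<"SMITH|19861216">>, <<"Key0001">>, maps:new()),
    %% AttrMap is now #{<<"fn">> => <<"SMITH">>, <<"dob">> => <<"19861216">>}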
diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl new file mode 100644 index 00000000..4a6619f5 --- /dev/null +++ b/src/leveled_eval.erl @@ -0,0 +1,240 @@ +%% -------- Eval Functions --------- +%% +%% Support for different eval expressions within leveled +%% + +-module(leveled_eval). + +-export([apply_eval/4]). + +%%%============================================================================ +%%% External API +%%%============================================================================ + +apply_eval({eval, Eval}, Term, Key, AttrMap) -> + apply_eval(Eval, Term, Key, AttrMap); +apply_eval({'PIPE', Eval1, 'INTO', Eval2}, Term, Key, AttrMap) -> + apply_eval(Eval2, Term, Key, apply_eval(Eval1, Term, Key, AttrMap)); +apply_eval({ + delim, {identifier, _, InKey}, {string, _, Delim}, ExpKeys}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + CptTerms = string:split(TermToSplit, Delim, all), + L = min(length(CptTerms), length(ExpKeys)), + maps:merge( + AttrMap, + maps:from_list( + lists:zip(lists:sublist(ExpKeys, L), lists:sublist(CptTerms, L))) + ); +apply_eval( + {join, InKeys, {string, _, Delim}, {identifier, _, OutKey}}, + _Term, _Key, AttrMap) -> + NewTerm = + unicode:characters_to_binary( + lists:join( + Delim, + lists:map( + fun(InKey) -> maps:get(InKey, AttrMap, <<"">>) end, + InKeys) + ) + ), + maps:put(OutKey, NewTerm, AttrMap); +apply_eval({ + split, {identifier, _, InKey}, {string, _, Delim}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + TermList = string:split(TermToSplit, Delim, all), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {slice, {identifier, _, InKey}, {integer, _, Width}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToSlice = term_to_process(InKey, Term, Key, AttrMap), + TermCount = string:length(TermToSlice) div Width, + TermList = + lists:map( + fun(S) -> string:slice(TermToSlice, S, Width) end, + lists:map(fun(I) -> Width * I end, lists:seq(0, TermCount - 1))), + maps:put(OutKey, TermList, AttrMap); +apply_eval( + {index, {identifier, _, InKey}, + {integer, _, Start}, {integer, _, Length}, + {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToIndex = term_to_process(InKey, Term, Key, AttrMap), + case string:length(TermToIndex) of + L when L >= (Start + Length) -> + maps:put( + OutKey, string:slice(TermToIndex, Start, Length), AttrMap); + _ -> + AttrMap + end; +apply_eval( + {to_integer, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + TermToConvert = term_to_process(InKey, Term, Key, AttrMap), + case string:to_integer(TermToConvert) of + {I, _Rest} when is_integer(I) -> + maps:put(OutKey, I, AttrMap); + _ -> + AttrMap + end. + +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +term_to_process(<<"term">>, Term, _Key, _AttrMap) -> + Term; +term_to_process(<<"key">>, _Term, Key, _AttrMap) -> + Key; +term_to_process(AttrKey, _Term, _Key, AttrMap) -> + maps:get(AttrKey, AttrMap, <<"">>). + +%%%============================================================================ +%%% Test +%%%============================================================================ + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). 
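+
+%% A reading aid for the tests below: 'PIPE' composes stages by feeding the
+%% attribute map produced by the left-hand eval into the right-hand one, so
+%% "E1 | E2" evaluates (per the 'PIPE' clause above) as:
+%%
+%%   apply_eval(E2, Term, Key, apply_eval(E1, Term, Key, AttrMap))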
+ +basic_test() -> + EvalString1 = "delim($term, \"|\", ($fn, $dob, $dod, $gns, $pcs))", + EvalString2 = "delim($gns, \"#\", ($gn1, $gn2, $gn3))", + + EvalString3 = EvalString1 ++ " | " ++ EvalString2, + {ok, Tokens3, _EndLine3} = leveled_evallexer:string(EvalString3), + {ok, ParsedExp3} = leveled_evalparser:parse(Tokens3), + EvalOut3 = + apply_eval( + ParsedExp3, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut3)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut3)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut3)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut3)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut3)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut3)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut3)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut3)), + + + EvalString4 = EvalString3 ++ " | join(($dob, $fn), \"|\", $dobfn)", + {ok, Tokens4, _EndLine4} = leveled_evallexer:string(EvalString4), + {ok, ParsedExp4} = leveled_evalparser:parse(Tokens4), + EvalOut4 = + apply_eval( + ParsedExp4, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut4)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut4)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut4)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut4)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut4)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut4)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut4)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut4)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut4)), + + + EvalString5 = EvalString4 ++ " | index($dob, 0, 4, $yob) | to_integer($yob, $yob)", + {ok, Tokens5, _EndLine5} = leveled_evallexer:string(EvalString5), + {ok, ParsedExp5} = leveled_evalparser:parse(Tokens5), + EvalOut5 = + apply_eval( + ParsedExp5, + <<"SMITH|19861216||Willow#Mia|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut5)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut5)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut5)), + ?assertMatch(<<"Willow#Mia">>, maps:get(<<"gns">>, EvalOut5)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut5)), + ?assertMatch(<<"Willow">>, maps:get(<<"gn1">>, EvalOut5)), + ?assertMatch(<<"Mia">>, maps:get(<<"gn2">>, EvalOut5)), + ?assertNot(maps:is_key(<<"gn3">>, EvalOut5)), + ?assertMatch(<<"19861216|SMITH">>, maps:get(<<"dobfn">>, EvalOut5)), + ?assertMatch(1986, maps:get(<<"yob">>, EvalOut5)), + + EvalString6 = EvalString1 ++ " | slice($gns, 2, $gns)", + {ok, Tokens6, _EndLine6} = leveled_evallexer:string(EvalString6), + {ok, ParsedExp6} = leveled_evalparser:parse(Tokens6), + EvalOut6 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1Ve|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut6)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut6)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut6)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut6)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut6)), + + EvalOut7 = + apply_eval( + ParsedExp6, + <<"SMITH|19861216||MAN1VeZ|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + 
?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut7)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut7)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut7)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut7)), + ?assertMatch([<<"MA">>, <<"N1">>, <<"Ve">>], maps:get(<<"gns">>, EvalOut7)), + + EvalString8 = EvalString1 ++ " | split($gns, \"#\", $gns)", + {ok, Tokens8, _EndLine8} = leveled_evallexer:string(EvalString8), + {ok, ParsedExp8} = leveled_evalparser:parse(Tokens8), + EvalOut8 = + apply_eval( + ParsedExp8, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut8)), + ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, EvalOut8)), + ?assertMatch(<<"">>, maps:get(<<"dod">>, EvalOut8)), + ?assertMatch(<<"LS1 4BT#LS8 1ZZ">>, maps:get(<<"pcs">>, EvalOut8)), + ?assertMatch([<<"Willow">>, <<"Mia">>, <<"Vera">>], maps:get(<<"gns">>, EvalOut8)), + + EvalString9 = + "delim($term, \"|\", ($name, $height, $weight, $pick)) |" + " to_integer($height, $height) |" + " to_integer($weight, $weight) |" + " to_integer($pick, $pick) |" + " delim($key, \"|\", ($team, $number)) |" + " index($team, 0, 9, $doh)", + {ok, Tokens9, _EndLine9} = leveled_evallexer:string(EvalString9), + {ok, ParsedExp9} = leveled_evalparser:parse(Tokens9), + EvalOut9 = + apply_eval( + ParsedExp9, + <<"WEMBANYAMA|224cm|95kg|#1">>, + <<"SPURS|00001">>, + maps:new() + ), + ?assertMatch(<<"WEMBANYAMA">>, maps:get(<<"name">>, EvalOut9)), + ?assertMatch(224, maps:get(<<"height">>, EvalOut9)), + ?assertMatch(95, maps:get(<<"weight">>, EvalOut9)), + ?assertMatch(<<"#1">>, maps:get(<<"pick">>, EvalOut9)), + % Not changes as not starting with integer + ?assertMatch(<<"SPURS">>, maps:get(<<"team">>, EvalOut9)), + ?assertMatch(<<"00001">>, maps:get(<<"number">>, EvalOut9)), + ?assertNot(maps:is_key(<<"doh">>, EvalOut9)) + . + + + +-endif. \ No newline at end of file diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl new file mode 100644 index 00000000..29e65fbf --- /dev/null +++ b/src/leveled_evallexer.xrl @@ -0,0 +1,33 @@ +%% Lexer for eval expressions + +Definitions. +WhiteSpace = ([\t\f\v\r\n\s]+) + +Rules. + +{WhiteSpace} : skip_token. + +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +, : {token, {',', TokenLine}}. +\| : {token, {'PIPE', TokenLine}}. + +delim : {token, {delim, TokenLine}}. +join : {token, {join, TokenLine}}. +split : {token, {split, TokenLine}}. +slice : {token, {slice, TokenLine}}. +index : {token, {index, TokenLine}}. +to_integer : {token, {to_integer, TokenLine}}. + +\$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" +[0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. + +Erlang code. + +strip_string(TokenChars) -> + list_to_binary(lists:sublist(TokenChars, 2, length(TokenChars) -2)). + +strip_identifier(TokenChars) -> + [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl new file mode 100644 index 00000000..43a1b2ab --- /dev/null +++ b/src/leveled_evalparser.yrl @@ -0,0 +1,39 @@ +%% Grammar for eval expressions + +Nonterminals +top_level eval identifiers identifier_list. + +Terminals +'(' ')' ',' +identifier string integer +'PIPE' +delim join split slice index to_integer. + +Rootsymbol top_level. 
+ +top_level -> eval: {eval, '$1'}. + +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. +eval -> to_integer '(' identifier ',' 'identifier' ')' : {to_integer, '$3', '$5'}. + +identifier_list -> '(' identifiers ')' : strip_ids('$2'). + +identifiers -> identifier ',' identifiers : ['$1' | '$3']. +identifiers -> identifier : ['$1']. + +Endsymbol '$end'. + +Right 100 'PIPE'. + +Erlang code. + +strip_ids(IDL) -> + lists:map( + fun(ID) -> element(3, ID) end, + lists:flatten(IDL) + ). \ No newline at end of file diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 43fb1573..0e9515fd 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -36,7 +36,14 @@ apply_filter({'BETWEEN', {identifier, _, ID}, CompA, CompB}, AttrMap) -> V >= Low andalso V =< High end end; -apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) -> +apply_filter({'IN', {string, _, TestString}, {identifier, _, ID}}, AttrMap) -> + case maps:get(ID, AttrMap, notfound) of + CheckList when is_list(CheckList) -> + lists:member(TestString, CheckList); + _ -> + false + end; +apply_filter({'IN', {identifier, _, ID}, CheckList}, AttrMap) when is_list(CheckList) -> case maps:get(ID, AttrMap, notfound) of notfound -> false; @@ -233,6 +240,12 @@ filterexpression_test() -> {ok, Filter5A} = generate_filter_expression(FE5A, maps:new()), ?assert(apply_filter(Filter5A, M5)), ?assertNot(apply_filter(Filter5A, M6)), + FE5B = + "$dob >= \"19740301\" AND $dob <= \"19761030\"" + " AND $dod = \"20221216\"", + {ok, Filter5B} = generate_filter_expression(FE5B, maps:new()), + ?assert(apply_filter(Filter5B, M5)), + ?assertNot(apply_filter(Filter5B, M6)), FE6 = "(contains($gn, \"MA\") OR $fn BETWEEN \"SM\" AND \"SN\")" @@ -335,7 +348,22 @@ filterexpression_test() -> ?assertNot( apply_filter( Filter13, - #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})) + #{<<"dob">> => <<"19440812">>, <<"dod">> => <<"20240213">>})), + + FE14 = "\"M1\" IN $gns", + {ok, Filter14} = generate_filter_expression(FE14, maps:new()), + ?assert( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M1">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => [<<"MA">>, <<"M2">>, <<"A0">>]})), + ?assertNot( + apply_filter( + Filter14, + #{<<"gns">> => <<"M1">>})) . -endif. diff --git a/src/leveled_filterlexer.xrl b/src/leveled_filterlexer.xrl index 9fea88ff..6d779f41 100644 --- a/src/leveled_filterlexer.xrl +++ b/src/leveled_filterlexer.xrl @@ -24,11 +24,11 @@ IN : {token, {'IN', TokenLine}}. <= : {token, {comparator, '<=', TokenLine}}. >= : {token, {comparator, '>=', TokenLine}}. -contains : {token, {'contains', TokenLine}}. -begins_with : {token, {'begins_with', TokenLine}}. -attribute_exists : {token, {'attribute_exists', TokenLine}}. -attribute_not_exists : {token, {'attribute_not_exists', TokenLine}}. -attribute_empty : {token, {'attribute_empty', TokenLine}}. +contains : {token, {contains, TokenLine}}. +begins_with : {token, {begins_with, TokenLine}}. +attribute_exists : {token, {attribute_exists, TokenLine}}. 
+attribute_not_exists : {token, {attribute_not_exists, TokenLine}}. +attribute_empty : {token, {attribute_empty, TokenLine}}. \$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. \:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. diff --git a/src/leveled_filterparser.yrl b/src/leveled_filterparser.yrl index d9253590..ca060eb9 100644 --- a/src/leveled_filterparser.yrl +++ b/src/leveled_filterparser.yrl @@ -9,8 +9,8 @@ Terminals '(' ')' comparator identifier string integer ',' 'NOT' 'AND' 'OR' 'IN' 'BETWEEN' -'contains' 'begins_with' -'attribute_exists' 'attribute_not_exists' 'attribute_empty'. +contains begins_with +attribute_exists attribute_not_exists attribute_empty. Rootsymbol top_level. @@ -18,17 +18,18 @@ Rootsymbol top_level. top_level -> condition: {condition, '$1'}. condition -> operand comparator operand : {'$2', '$1', '$3'}. -condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. +condition -> operand 'BETWEEN' string 'AND' string : {'BETWEEN', '$1', '$3', '$5'}. condition -> operand 'BETWEEN' integer 'AND' integer : {'BETWEEN', '$1', '$3', '$5'}. condition -> operand 'IN' operand_list : {'IN', '$1', '$3'}. +condition -> operand 'IN' identifier : {'IN', '$1', '$3'}. condition -> condition 'AND' condition : {'AND', '$1', '$3'}. condition -> condition 'OR' condition : {'OR', '$1', '$3'}. condition -> 'NOT' condition : {'NOT', '$2'}. -condition -> 'contains' '(' identifier ',' string ')' : {'contains', '$3', '$5'}. -condition -> 'begins_with' '(' identifier ',' string ')' : {'begins_with', '$3', '$5'}. -condition -> 'attribute_exists' '(' identifier ')' : {'attribute_exists', '$3'}. -condition -> 'attribute_not_exists' '(' identifier ')' : {'attribute_not_exists', '$3'}. -condition -> 'attribute_empty' '(' identifier ')' : {'attribute_empty', '$3'}. +condition -> contains '(' identifier ',' string ')' : {contains, '$3', '$5'}. +condition -> begins_with '(' identifier ',' string ')' : {begins_with, '$3', '$5'}. +condition -> attribute_exists '(' identifier ')' : {attribute_exists, '$3'}. +condition -> attribute_not_exists '(' identifier ')' : {attribute_not_exists, '$3'}. +condition -> attribute_empty '(' identifier ')' : {attribute_empty, '$3'}. condition -> '(' condition ')' : '$2'. operand -> identifier : '$1'. From 80ce3c7f1a6feacbf19fb579a9703f0d4bb01079 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 30 Apr 2024 16:27:22 +0100 Subject: [PATCH 6/8] Update and extend the eval expression Allow for additional functions in eval. Change the leveled query API to generalise application of eval/filter expressions. --- src/leveled_codec.erl | 56 +++---- src/leveled_eval.erl | 253 ++++++++++++++++++++++++++++- src/leveled_evallexer.xrl | 39 +++-- src/leveled_evalparser.yrl | 45 +++-- src/leveled_filter.erl | 25 +-- test/end_to_end/iterator_SUITE.erl | 34 ++-- test/end_to_end/perf_SUITE.erl | 15 +- 7 files changed, 381 insertions(+), 86 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 9270668e..7122a275 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -114,10 +114,14 @@ lookup|no_lookup. -type actual_regex() :: reference()|{re_pattern, term(), term(), term(), term()}. --type capture_filter_fun() :: fun((#{atom() => string()}) -> boolean()). +-type capture_value() :: binary()|integer(). +-type capture_filter_fun() :: + fun((#{binary() => capture_value()}) -> boolean()). 
+-type capture_eval_fun() :: + fun((binary(), binary()) -> #{binary() => capture_value()}). -type capture_reference() :: - {capture, actual_regex(), list(atom()|binary()), capture_filter_fun()}| - {delimited, string(), list(binary()), capture_filter_fun()}. + {capture, actual_regex(), list(binary()), capture_filter_fun()}| + {eval, capture_eval_fun(), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -316,35 +320,28 @@ accumulate_index( case leveled_util:regex_run(IdxValue, TermRegex, Opts) of {match, CptTerms} -> L = min(length(CptTerms), ExpectedKeys), + CptMap = + maps:from_list( + lists:zip( + lists:sublist(CptKeys, L), + lists:sublist(CptTerms, L))), check_captured_terms( - lists:sublist(CptTerms, L), - lists:sublist(CptKeys, L), - FilterFun, - AddTerm, - FoldKeysFun, - Bucket, - IdxValue, - ObjKey, + CptMap, + FilterFun, AddTerm, FoldKeysFun, + Bucket, IdxValue, ObjKey, Acc); _ -> Acc end end; accumulate_index( - {AddTerm, {delimited, Delim, CptKeys, FilterFun}}, FoldKeysFun) -> - ExpectedKeys = length(CptKeys), + {AddTerm, {eval, EvalFun, FilterFun}}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - CptTerms = string:split(IdxValue, Delim, all), - L = min(length(CptTerms), ExpectedKeys), + CptMap = EvalFun(IdxValue, ObjKey), check_captured_terms( - lists:sublist(CptTerms, L), - lists:sublist(CptKeys, L), - FilterFun, - AddTerm, - FoldKeysFun, - Bucket, - IdxValue, - ObjKey, + CptMap, + FilterFun, AddTerm, FoldKeysFun, + Bucket, IdxValue, ObjKey, Acc) end; accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> @@ -363,8 +360,7 @@ accumulate_index({AddTerm, TermRegex}, FoldKeysFun) -> end. check_captured_terms( - CptTerms, CptKeys, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> - CptMap = maps:from_list(lists:zip(CptKeys, CptTerms)), + CptMap, FilterFun, AddTerm, FoldKeysFun, B, IdxValue, ObjKey, Acc) -> case FilterFun(CptMap) of true -> case AddTerm of @@ -372,9 +368,13 @@ check_captured_terms( FoldKeysFun(B, {IdxValue, ObjKey}, Acc); false -> FoldKeysFun(B, ObjKey, Acc); - CptKey when is_atom(CptKey) -> - CptValue = maps:get(CptKey, CptMap), - FoldKeysFun(B, {CptValue, ObjKey}, Acc) + CptKey when is_binary(CptKey) -> + case maps:get(CptKey, CptMap, undefined) of + undefined -> + Acc; + CptValue -> + FoldKeysFun(B, {CptValue, ObjKey}, Acc) + end end; false -> Acc diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 4a6619f5..3f9e8614 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -5,12 +5,13 @@ -module(leveled_eval). --export([apply_eval/4]). +-export([apply_eval/4, generate_eval_expression/2]). 
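+
+%% A sketch of how these exports are wired into the {eval, EvalFun, FilterFun}
+%% term handling added to leveled_codec in this commit (names illustrative):
+%%
+%%   {ok, E} = generate_eval_expression(EvalString, maps:new()),
+%%   EvalFun = fun(Term, Key) -> apply_eval(E, Term, Key, maps:new()) end,
+%%   TermHandling = {false, {eval, EvalFun, FilterFun}}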
%%%============================================================================ %%% External API %%%============================================================================ + apply_eval({eval, Eval}, Term, Key, AttrMap) -> apply_eval(Eval, Term, Key, AttrMap); apply_eval({'PIPE', Eval1, 'INTO', Eval2}, Term, Key, AttrMap) -> @@ -40,14 +41,15 @@ apply_eval( ), maps:put(OutKey, NewTerm, AttrMap); apply_eval({ - split, {identifier, _, InKey}, {string, _, Delim}, {identifier, _, OutKey}}, + split, {identifier, _, InKey}, DelimAttr, {identifier, _, OutKey}}, Term, Key, AttrMap) -> TermToSplit = term_to_process(InKey, Term, Key, AttrMap), - TermList = string:split(TermToSplit, Delim, all), + TermList = string:split(TermToSplit, element(3, DelimAttr), all), maps:put(OutKey, TermList, AttrMap); apply_eval( - {slice, {identifier, _, InKey}, {integer, _, Width}, {identifier, _, OutKey}}, + {slice, {identifier, _, InKey}, WidthAttr, {identifier, _, OutKey}}, Term, Key, AttrMap) -> + {integer, _, Width} = WidthAttr, TermToSlice = term_to_process(InKey, Term, Key, AttrMap), TermCount = string:length(TermToSlice) div Width, TermList = @@ -68,6 +70,24 @@ apply_eval( _ -> AttrMap end; +apply_eval( + {kvsplit, + {identifier, _, InKey}, + {string, _, DelimPair}, {string, _, DelimKV}}, + Term, Key, AttrMap) -> + TermToSplit = term_to_process(InKey, Term, Key, AttrMap), + lists:foldl( + fun(S, AccMap) -> + case string:split(S, DelimKV, all) of + [K, V] -> + maps:put(K, V, AccMap); + _ -> + AccMap + end + end, + AttrMap, + string:split(TermToSplit, DelimPair, all) + ); apply_eval( {to_integer, {identifier, _, InKey}, {identifier, _, OutKey}}, Term, Key, AttrMap) -> @@ -77,12 +97,66 @@ apply_eval( maps:put(OutKey, I, AttrMap); _ -> AttrMap + end; +apply_eval( + {to_string, {identifier, _, InKey}, {identifier, _, OutKey}}, + Term, Key, AttrMap) -> + case term_to_process(InKey, Term, Key, AttrMap) of + TermToConvert when is_integer(TermToConvert) -> + maps:put( + OutKey, + list_to_binary(integer_to_list(TermToConvert)), + AttrMap); + _ -> + AttrMap + end; +apply_eval( + {map, InID, Comparator, MapList, Default, OutID}, + Term, Key, AttrMap) -> + {identifier, _, InKey} = InID, + {identifier, _, OutKey} = OutID, + TermToCompare = term_to_process(InKey, Term, Key, AttrMap), + F = reverse_compare_mapping(element(2, Comparator), TermToCompare), + case lists:dropwhile(F, MapList) of + [] -> + maps:put(OutKey, element(3, Default), AttrMap); + [{mapping, _T, Assignment}|_Rest] -> + maps:put(OutKey, element(3, Assignment), AttrMap) + end; +apply_eval( + {MathOp, OperandX, OperandY, {identifier, _, OutKey}}, + _Term, _Key, AttrMap) + when MathOp == add; MathOp == subtract -> + X = maybe_fetch_operand(OperandX, AttrMap), + Y = maybe_fetch_operand(OperandY, AttrMap), + case MathOp of + add when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X + Y, AttrMap); + subtract when is_integer(X), is_integer(Y) -> + maps:put(OutKey, X - Y, AttrMap); + _ -> + AttrMap + end. + +generate_eval_expression(EvalString, Substitutions) -> + {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString), + case leveled_filter:substitute_items(Tokens, Substitutions, []) of + {error, Error} -> + {error, Error}; + UpdTokens -> + leveled_evalparser:parse(UpdTokens) end. 
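+
+%% Substitution tokens (":name") are resolved against the map before parsing,
+%% via leveled_filter:substitute_items/3. An illustrative (assumed) use:
+%%
+%%   generate_eval_expression(
+%%       "split($term, :delim, $parts)", #{<<"delim">> => <<"#">>})
+%%
+%% should parse as if the string had been "split($term, \"#\", $parts)".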
+ %%%============================================================================ %%% Internal functions %%%============================================================================ +maybe_fetch_operand({identifier, _, ID}, AttrMap) -> + maps:get(ID, AttrMap, 0); +maybe_fetch_operand(Op, _AttrMap) -> + element(3, Op). + term_to_process(<<"term">>, Term, _Key, _AttrMap) -> Term; term_to_process(<<"key">>, _Term, Key, _AttrMap) -> @@ -90,6 +164,17 @@ term_to_process(<<"key">>, _Term, Key, _AttrMap) -> term_to_process(AttrKey, _Term, _Key, AttrMap) -> maps:get(AttrKey, AttrMap, <<"">>). +reverse_compare_mapping('<', Term) -> + fun({mapping, T, _A}) -> Term >= element(3, T) end; +reverse_compare_mapping('<=', Term) -> + fun({mapping, T, _A}) -> Term > element(3, T) end; +reverse_compare_mapping('>', Term) -> + fun({mapping, T, _A}) -> Term =< element(3, T) end; +reverse_compare_mapping('>=', Term) -> + fun({mapping, T, _A}) -> Term < element(3, T) end; +reverse_compare_mapping('=', Term) -> + fun({mapping, T, _A}) -> Term =/= element(3, T) end. + %%%============================================================================ %%% Test %%%============================================================================ @@ -232,7 +317,165 @@ basic_test() -> % Not changes as not starting with integer ?assertMatch(<<"SPURS">>, maps:get(<<"team">>, EvalOut9)), ?assertMatch(<<"00001">>, maps:get(<<"number">>, EvalOut9)), - ?assertNot(maps:is_key(<<"doh">>, EvalOut9)) + ?assertNot(maps:is_key(<<"doh">>, EvalOut9)), + + %% Age at 30 April 2024 + EvalString10 = + EvalString5 ++ + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + " | add($age, 1, $age_next)" + " | to_string($age, $age)" + , + {ok, Tokens10, _EndLine10} = leveled_evallexer:string(EvalString10), + {ok, ParsedExp10} = leveled_evalparser:parse(Tokens10), + EvalOut10A = + apply_eval( + ParsedExp10, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"37">>, maps:get(<<"age">>, EvalOut10A)), + ?assertMatch(38, maps:get(<<"age_next">>, EvalOut10A)), + EvalOut10B = + apply_eval( + ParsedExp10, + <<"SMITH|19860216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"38">>, maps:get(<<"age">>, EvalOut10B)), + EvalString10F = + EvalString1 ++ + " | index($dob, 0, 4, $yob)" + " | index($dob, 4, 4, $birthday)" + " | map($birthday, <=, ((\"0430\", 2024)), 2023, $yoc)" + " | subtract($yoc, $yob, $age)" + % yob has not been converted to an integer, + % so the age will not be set + " | to_string($age, $age)" + , + {ok, Tokens10F, _EndLine10F} = leveled_evallexer:string(EvalString10F), + {ok, ParsedExp10F} = leveled_evalparser:parse(Tokens10F), + EvalOut10F = + apply_eval( + ParsedExp10F, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertNot(maps:is_key(<<"age">>, EvalOut10F)), + + EvalString11A = + EvalString1 ++ + " | map($dob, <, " + "((\"1946\", \"Silent\"), (\"1966\", \"Boomer\")," + "(\"1980\", \"GenX\"), (\"1997\", \"Millenial\")), \"GenZ\"," + " $generation)", + {ok, Tokens11A, _EndLine11A} = leveled_evallexer:string(EvalString11A), + {ok, ParsedExp11A} = leveled_evalparser:parse(Tokens11A), + EvalOut11A = + apply_eval( + ParsedExp11A, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11A)), + EvalString11B = + 
EvalString1 ++ + " | map($dob, <=, " + "((\"1945\", \"Silent\"), (\"1965\", \"Boomer\")," + "(\"1979\", \"GenX\"), (\"1996\", \"Millenial\")), \"GenZ\"," + " $generation)", + {ok, Tokens11B, _EndLine11B} = leveled_evallexer:string(EvalString11B), + {ok, ParsedExp11B} = leveled_evalparser:parse(Tokens11B), + EvalOut11B = + apply_eval( + ParsedExp11B, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11B)), + EvalString11C = + EvalString1 ++ + " | map($dob, >, " + "((\"1996\", \"GenZ\"), (\"1979\", \"Millenial\")," + "(\"1965\", \"GenX\"), (\"1945\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11C, _EndLine11C} = leveled_evallexer:string(EvalString11C), + {ok, ParsedExp11C} = leveled_evalparser:parse(Tokens11C), + EvalOut11C = + apply_eval( + ParsedExp11C, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11C)), + EvalString11D = + EvalString1 ++ + " | map($dob, >=, " + "((\"1997\", \"GenZ\"), (\"1980\", \"Millenial\")," + "(\"1966\", \"GenX\"), (\"1946\", \"Boomer\")), \"Silent\"," + " $generation)", + {ok, Tokens11D, _EndLine11D} = leveled_evallexer:string(EvalString11D), + {ok, ParsedExp11D} = leveled_evalparser:parse(Tokens11D), + EvalOut11D = + apply_eval( + ParsedExp11D, + <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>, + <<"9000000001">>, + maps:new() + ), + ?assertMatch(<<"Millenial">>, maps:get(<<"generation">>, EvalOut11D)), + + EvalString12 = + "kvsplit($term, \"|\", \"=\") | index($term, 0, 12, $ts) |" + " to_integer($ts, $ts) |" + " to_integer($DEBUG, $DEBUG) |" + " to_integer($INFO, $INFO) |" + " to_integer($WARN, $WARN) |" + " to_integer($ERROR, $ERROR) |" + " to_integer($CRITICAL, $CRITICAL) |" + " add($DEBUG, $INFO, $TOTAL) |" + " add($TOTAL, $WARN, $TOTAL) |" + " add($TOTAL, $ERROR, $TOTAL) |" + " add($TOTAL, $CRITICAL, $TOTAL)" + , + {ok, Tokens12, _EndLine12} = leveled_evallexer:string(EvalString12), + {ok, ParsedExp12} = leveled_evalparser:parse(Tokens12), + EvalOut12 = + apply_eval( + ParsedExp12, + <<"063881703147|DEBUG=804|INFO=186|WARN=10">>, + <<"ABC1233">>, + maps:new() + ), + ?assertMatch(63881703147, maps:get(<<"ts">>, EvalOut12)), + ?assertMatch(1000, maps:get(<<"TOTAL">>, EvalOut12)), + ?assertNot(maps:is_key(<<"CRITICAL">>, EvalOut12)), + + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(\"1972\", \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, Tokens13, _EndLine13} = leveled_evallexer:string(EvalString13), + {ok, ParsedExp13} = leveled_evalparser:parse(Tokens13), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + EvalOut13B = + apply_eval(ParsedExp13, <<"cup_year:1970">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"bad">>, maps:get(<<"cup_happy">>, EvalOut13B)), + EvalOut13C = + apply_eval(ParsedExp13, <<"cup_year:2024">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C)) . diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl index 29e65fbf..dcee00fc 100644 --- a/src/leveled_evallexer.xrl +++ b/src/leveled_evallexer.xrl @@ -7,19 +7,32 @@ Rules. {WhiteSpace} : skip_token. -\( : {token, {'(', TokenLine}}. -\) : {token, {')', TokenLine}}. 
-, : {token, {',', TokenLine}}. -\| : {token, {'PIPE', TokenLine}}. - -delim : {token, {delim, TokenLine}}. -join : {token, {join, TokenLine}}. -split : {token, {split, TokenLine}}. -slice : {token, {slice, TokenLine}}. -index : {token, {index, TokenLine}}. -to_integer : {token, {to_integer, TokenLine}}. +\( : {token, {'(', TokenLine}}. +\) : {token, {')', TokenLine}}. +, : {token, {',', TokenLine}}. +\| : {token, {'PIPE', TokenLine}}. + +delim : {token, {delim, TokenLine}}. +join : {token, {join, TokenLine}}. +split : {token, {split, TokenLine}}. +slice : {token, {slice, TokenLine}}. +index : {token, {index, TokenLine}}. +kvsplit : {token, {kvsplit, TokenLine}}. +to_integer : {token, {to_integer, TokenLine}}. +to_string : {token, {to_string, TokenLine}}. +add : {token, {add, TokenLine}}. +subtract : {token, {subtract, TokenLine}}. +map : {token, {map, TokenLine}}. + += : {token, {comparator, '=', TokenLine}}. +< : {token, {comparator, '<', TokenLine}}. +> : {token, {comparator, '>', TokenLine}}. +<> : {token, {comparator, '<>', TokenLine}}. +<= : {token, {comparator, '<=', TokenLine}}. +>= : {token, {comparator, '>=', TokenLine}}. \$[a-zA-Z_][a-zA-Z_0-9]* : {token, {identifier, TokenLine, strip_identifier(TokenChars)}}. +\:[a-zA-Z_][a-zA-Z_0-9]* : {token, {substitution, TokenLine, strip_substitution(TokenChars)}}. \"[^"]*\" : {token, {string, TokenLine, strip_string(TokenChars)}}. %" [0-9]+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}. @@ -30,4 +43,8 @@ strip_string(TokenChars) -> strip_identifier(TokenChars) -> [36|StrippedChars] = TokenChars, + list_to_binary(StrippedChars). + +strip_substitution(TokenChars) -> + [58|StrippedChars] = TokenChars, list_to_binary(StrippedChars). \ No newline at end of file diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl index 43a1b2ab..be475980 100644 --- a/src/leveled_evalparser.yrl +++ b/src/leveled_evalparser.yrl @@ -1,27 +1,50 @@ %% Grammar for eval expressions Nonterminals -top_level eval identifiers identifier_list. +top_level eval +operand math_operand +mapping mappings mappings_list +identifiers identifier_list. Terminals '(' ')' ',' -identifier string integer +identifier string integer comparator 'PIPE' -delim join split slice index to_integer. +delim join split slice index kvsplit map +add subtract +to_integer to_string. Rootsymbol top_level. top_level -> eval: {eval, '$1'}. -eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. -eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. -eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. -eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. -eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. -eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. -eval -> to_integer '(' identifier ',' 'identifier' ')' : {to_integer, '$3', '$5'}. +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. 
+eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. +eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. +eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. +eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. +eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. +eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. -identifier_list -> '(' identifiers ')' : strip_ids('$2'). +mappings_list -> '(' mappings ')' : '$2'. + +mappings -> mapping ',' mappings : ['$1' | '$3']. +mappings -> mapping : ['$1']. + +mapping -> '(' operand ',' operand ')' : {mapping, '$2', '$4'}. + +operand -> string : '$1'. +operand -> integer : '$1'. + +math_operand -> integer : '$1'. +math_operand -> identifier : '$1'. + +identifier_list -> '(' identifiers ')' : strip_ids('$2'). identifiers -> identifier ',' identifiers : ['$1' | '$3']. identifiers -> identifier : ['$1']. diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 0e9515fd..5d17627f 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -8,7 +8,8 @@ -export( [ generate_filter_expression/2, - apply_filter/2 + apply_filter/2, + substitute_items/3 ]). %%%============================================================================ @@ -110,17 +111,6 @@ generate_filter_expression(FilterString, Substitutions) -> leveled_filterparser:parse(UpdTokens) end. -%%%============================================================================ -%%% Internal functions -%%%============================================================================ - -compare('>', V, CmpA) -> V > CmpA; -compare('>=', V, CmpA) -> V >= CmpA; -compare('<', V, CmpA) -> V < CmpA; -compare('<=', V, CmpA) -> V =< CmpA; -compare('=', V, CmpA) -> V == CmpA; -compare('<>', V, CmpA) -> V =/= CmpA. - substitute_items([], _Subs, UpdTokens) -> lists:reverse(UpdTokens); substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> @@ -142,6 +132,17 @@ substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> substitute_items([Token|Rest], Subs, UpdTokens) -> substitute_items(Rest, Subs, [Token|UpdTokens]). +%%%============================================================================ +%%% Internal functions +%%%============================================================================ + +compare('>', V, CmpA) -> V > CmpA; +compare('>=', V, CmpA) -> V >= CmpA; +compare('<', V, CmpA) -> V < CmpA; +compare('<=', V, CmpA) -> V =< CmpA; +compare('=', V, CmpA) -> V == CmpA; +compare('<>', V, CmpA) -> V =/= CmpA. 
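+
+%% Note that compare/3 relies on Erlang term ordering, so binary attributes
+%% compare byte-wise; fixed-width date strings therefore order correctly in
+%% BETWEEN and comparator conditions, e.g.
+%%   compare('<', <<"19740301">>, <<"19761030">>) =:= true.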
+ %%%============================================================================ %%% Test diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index 955c036e..be0cb71b 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -879,7 +879,7 @@ capture_and_filter_terms(_Config) -> "[^\\|]*#LS[^\\|]*", FilterFun1 = fun(Captures) -> - DoB = maps:get(dob, Captures), + DoB = maps:get(<<"dob">>, Captures), (DoB >= StartDoB) andalso (DoB =< EndDoB) end, {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor), @@ -889,7 +889,7 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, WillowLeedsExtractorPCRE, [dob], FilterFun1}} + {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}} }, {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1), BornMid70sPCRE1 = RunnerPCRE1(), @@ -903,20 +903,21 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}} + {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}} }, {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2), BornMid70sRE2_2 = RunnerRE2_2(), SW3 = os:timestamp(), + AllFun = fun(_) -> true end, QueryRE2_3 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, - {dob, - {capture, WillowLeedsExtractorRE2, [dob], fun(_) -> true end}} + {<<"dob">>, + {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}} }, {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3), Results3 = RunnerRE2_3(), @@ -941,7 +942,7 @@ capture_and_filter_terms(_Config) -> FilterFun2 = fun(Captures) -> - DoB = maps:get(dob, Captures), + DoB = maps:get(<<"dob">>, Captures), (DoB >= StartDoB) andalso (DoB =< EndDoB) end, QueryRE2_4 = @@ -952,7 +953,7 @@ capture_and_filter_terms(_Config) -> {false, {capture, element(2, re2:compile(WillowLeedsDoubleExtractor)), - [dob, dod], + [<<"dob">>, <<"dod">>], FilterFun2}} }, {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4), @@ -966,7 +967,7 @@ capture_and_filter_terms(_Config) -> {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {true, - {capture, WillowLeedsExtractorRE2, [dob], FilterFun1}} + {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}} }, {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5), BornMid70sRE2_5 = @@ -1039,16 +1040,23 @@ capture_and_filter_terms(_Config) -> fun(AttrMap) -> leveled_filter:apply_filter(FilterExpressionParsed2, AttrMap) end, - + {ok, EvalExpression2} = + leveled_eval:generate_eval_expression( + "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))", + maps:new() + ), + EvalFun2 = + fun(Term, Key) -> + leveled_eval:apply_eval(EvalExpression2, Term, Key, maps:new()) + end, QueryRE2_10 = {index_query, {Bucket, null}, {fun testutil:foldkeysfun/3, []}, {IdxName, <<"M">>, <<"Z">>}, {false, - {delimited, - "|", - [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], + {eval, + EvalFun2, FilterFun6}} }, {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10), @@ -1089,7 +1097,7 @@ capture_and_filter_terms(_Config) -> "~nRE2 single-capture pre-filter with parsed query string took ~w ms~n", [timer:now_diff(SW10, SW9) div 1000]), maybe_log_toscreen( - "~nDelimited index with parsed filter expression took ~w ms~n", + "~nEval processed index with parsed filter 
expression took ~w ms~n", [timer:now_diff(SW11, SW10) div 1000]), ok = leveled_bookie:book_close(Book1), diff --git a/test/end_to_end/perf_SUITE.erl b/test/end_to_end/perf_SUITE.erl index a4606292..e4c54117 100644 --- a/test/end_to_end/perf_SUITE.erl +++ b/test/end_to_end/perf_SUITE.erl @@ -582,6 +582,14 @@ random_people_queries(true, Bookie, Bucket, IndexesReturned) -> FilterExpression, maps:new()), FilterFun = fun(AttrMap) -> leveled_filter:apply_filter(ParsedFilter, AttrMap) end, + EvalExpression = "delim($term, \"|\", ($surname, $dob, $dod, $gns, $pcs))", + {ok, ParsedEval} = + leveled_eval:generate_eval_expression(EvalExpression, maps:new()), + EvalFun = + fun(Term, Key) -> + leveled_eval:apply_eval(ParsedEval, Term, Key, maps:new()) + end, + QueryFun = fun() -> Surname = get_random_surname(), @@ -597,12 +605,7 @@ random_people_queries(true, Bookie, Bucket, IndexesReturned) -> {Bucket, <<>>}, {FoldKeysFun, 0}, Range, - {true, - {delimited, - "|", - [<<"surname">>, <<"dob">>, <<"dod">>, <<"gns">>, <<"pcs">>], - FilterFun - } + {true, {eval, EvalFun, FilterFun} }), R() end, From 1563a7c90f21285ccffb57850f08058086e7470f Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Tue, 30 Apr 2024 21:46:29 +0100 Subject: [PATCH 7/8] Extend testing --- src/leveled_eval.erl | 17 +++++++++++++++++ src/leveled_filter.erl | 2 +- test/end_to_end/iterator_SUITE.erl | 12 ++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 3f9e8614..8084af2a 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -479,5 +479,22 @@ basic_test() -> . +generate_test() -> + EvalString13 = + "kvsplit($term, \"|\", \":\") |" + " map($cup_year, =, " + "((\"1965\", \"bad\"), (\"1970\", \"bad\"), " + "(:clarke, \"good\"), (\"1974\", \"bad\")), " + "\"indifferent\", $cup_happy) ", + {ok, ParsedExp13} = + generate_eval_expression(EvalString13, #{<<"clarke">> => <<"1972">>}), + io:format("ParsedExp ~p~n", [ParsedExp13]), + EvalOut13A = + apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), + ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), + ?assertMatch( + {error, "Substitution <<\"clarke\">> not found"}, + generate_eval_expression(EvalString13, maps:new()) + ). -endif. 
\ No newline at end of file diff --git a/src/leveled_filter.erl b/src/leveled_filter.erl index 5d17627f..7761d4f4 100644 --- a/src/leveled_filter.erl +++ b/src/leveled_filter.erl @@ -121,7 +121,7 @@ substitute_items([{substitution, LN, ID}|Rest], Subs, UpdTokens) -> io_lib:format("Substitution ~p not found", [ID]))}; Value when is_binary(Value) -> substitute_items( - Rest, Subs, [{string, LN, binary_to_list(Value)}|UpdTokens]); + Rest, Subs, [{string, LN, Value}|UpdTokens]); Value when is_integer(Value) -> substitute_items(Rest, Subs, [{integer, LN, Value}|UpdTokens]); _UnexpectedValue -> diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl index be0cb71b..cb7a5a4a 100644 --- a/test/end_to_end/iterator_SUITE.erl +++ b/test/end_to_end/iterator_SUITE.erl @@ -1100,6 +1100,18 @@ capture_and_filter_terms(_Config) -> "~nEval processed index with parsed filter expression took ~w ms~n", [timer:now_diff(SW11, SW10) div 1000]), + QueryRE2_3_WrongCapture = + {index_query, + {Bucket, null}, + {fun testutil:foldkeysfun/3, []}, + {IdxName, <<"M">>, <<"Z">>}, + {<<"gns">>, + {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}} + }, + {async, RunnerRE2_3_WC} = + leveled_bookie:book_returnfolder(Book1, QueryRE2_3_WrongCapture), + true = [] == RunnerRE2_3_WC(), + ok = leveled_bookie:book_close(Book1), testutil:reset_filestructure(). From 59c556afa3ded9ad5f244c2f35226f40acecee5d Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 1 May 2024 16:01:19 +0100 Subject: [PATCH 8/8] Add regex eval function to pipeline Remove `capture` option - as all can be achieved via eval/filter. --- src/leveled_codec.erl | 23 -------- src/leveled_eval.erl | 88 +++++++++++++++++++++++++----- src/leveled_evallexer.xrl | 3 + src/leveled_evalparser.yrl | 45 +++++++++------ src/leveled_util.erl | 10 +++- test/end_to_end/iterator_SUITE.erl | 71 ++++++++++++------------ 6 files changed, 151 insertions(+), 89 deletions(-) diff --git a/src/leveled_codec.erl b/src/leveled_codec.erl index 7122a275..52a8b9d1 100644 --- a/src/leveled_codec.erl +++ b/src/leveled_codec.erl @@ -120,7 +120,6 @@ -type capture_eval_fun() :: fun((binary(), binary()) -> #{binary() => capture_value()}). -type capture_reference() :: - {capture, actual_regex(), list(binary()), capture_filter_fun()}| {eval, capture_eval_fun(), capture_filter_fun()}. -type regular_expression() :: actual_regex()|undefined|capture_reference(). @@ -312,28 +311,6 @@ accumulate_index({true, undefined}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> FoldKeysFun(Bucket, {IdxValue, ObjKey}, Acc) end; -accumulate_index( - {AddTerm, {capture, TermRegex, CptKeys, FilterFun}}, FoldKeysFun) -> - ExpectedKeys = length(CptKeys), - Opts = [{capture, all_but_first, binary}], - fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> - case leveled_util:regex_run(IdxValue, TermRegex, Opts) of - {match, CptTerms} -> - L = min(length(CptTerms), ExpectedKeys), - CptMap = - maps:from_list( - lists:zip( - lists:sublist(CptKeys, L), - lists:sublist(CptTerms, L))), - check_captured_terms( - CptMap, - FilterFun, AddTerm, FoldKeysFun, - Bucket, IdxValue, ObjKey, - Acc); - _ -> - Acc - end - end; accumulate_index( {AddTerm, {eval, EvalFun, FilterFun}}, FoldKeysFun) -> fun({?IDX_TAG, Bucket, {_IdxFld, IdxValue}, ObjKey}, _Value, Acc) -> diff --git a/src/leveled_eval.erl b/src/leveled_eval.erl index 8084af2a..4f3b3b46 100644 --- a/src/leveled_eval.erl +++ b/src/leveled_eval.erl @@ -5,12 +5,27 @@ -module(leveled_eval). 
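+
+%% With the `capture` query option removed, the equivalent behaviour comes
+%% from the eval pipeline. A minimal sketch, assuming an index term with a
+%% leading surname field and an eight-digit date of birth: a query that
+%% previously passed {capture, CompiledRE, [<<"dob">>], FilterFun} would
+%% now be built as
+%%
+%%   EvalFun =
+%%       leveled_eval:generate_eval_function(
+%%           "regex($term, :regex, re2, ($dob))",
+%%           #{<<"regex">> => <<"[^\\|]*\\|(?P<dob>[0-9]{8})\\|.*">>}),
+%%
+%% and passed in the TermHandling tuple as {false, {eval, EvalFun, FilterFun}}.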
--export([apply_eval/4, generate_eval_expression/2]).
+-export([apply_eval/4, generate_eval_expression/2, generate_eval_function/2]).
 
 %%%============================================================================
 %%% External API
 %%%============================================================================
 
+generate_eval_function(EvalString, Substitutions) ->
+    {ok, ParsedEval} = generate_eval_expression(EvalString, Substitutions),
+    fun(Term, Key) ->
+        apply_eval(ParsedEval, Term, Key, maps:new())
+    end.
+
+generate_eval_expression(EvalString, Substitutions) ->
+    {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString),
+    case leveled_filter:substitute_items(Tokens, Substitutions, []) of
+        {error, Error} ->
+            {error, Error};
+        UpdTokens ->
+            leveled_evalparser:parse(UpdTokens)
+    end.
+
 apply_eval({eval, Eval}, Term, Key, AttrMap) ->
     apply_eval(Eval, Term, Key, AttrMap);
 
@@ -136,17 +151,26 @@ apply_eval(
             maps:put(OutKey, X - Y, AttrMap);
         _ ->
             AttrMap
-    end.
-
-generate_eval_expression(EvalString, Substitutions) ->
-    {ok, Tokens, _EndLine} = leveled_evallexer:string(EvalString),
-    case leveled_filter:substitute_items(Tokens, Substitutions, []) of
-        {error, Error} ->
-            {error, Error};
-        UpdTokens ->
-            leveled_evalparser:parse(UpdTokens)
-    end.
-
+    end;
+apply_eval(
+    {regex, {identifier, _, InKey}, CompiledRE, ExpKeys},
+    Term, Key, AttrMap) ->
+    TermToCapture = term_to_process(InKey, Term, Key, AttrMap),
+    ExpectedKeyLength = length(ExpKeys),
+    Opts = [{capture, all_but_first, binary}],
+    case leveled_util:regex_run(TermToCapture, CompiledRE, Opts) of
+        {match, CptTerms} ->
+            L = min(length(CptTerms), ExpectedKeyLength),
+            CptMap =
+                maps:from_list(
+                    lists:zip(
+                        lists:sublist(ExpKeys, L),
+                        lists:sublist(CptTerms, L))),
+            maps:merge(AttrMap, CptMap);
+        _ ->
+            AttrMap
+    end
+    .
 
 %%%============================================================================
 %%% Internal functions
@@ -475,10 +499,47 @@ basic_test() ->
     ?assertMatch(<<"bad">>, maps:get(<<"cup_happy">>, EvalOut13B)),
     EvalOut13C =
         apply_eval(ParsedExp13, <<"cup_year:2024">>, <<"ABC1">>, maps:new()),
-    ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C))
+    ?assertMatch(<<"indifferent">>, maps:get(<<"cup_happy">>, EvalOut13C)),
+
+    ExtractRegex =
+        "(?P<fn>[^\\|]*)\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
+        "(?P<gns>[^\\|]*)\\|(?P<pcs>[^\\|]*)|.",
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, re2, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        ),
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, pcre, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        ),
+    ok =
+        check_regex_eval(
+            "regex($term, :regex, ($fn, $dob, $dod, $gns, $pcs))",
+            ExtractRegex
+        )
     .
 
+check_regex_eval(EvalString14, ExtractRegex) ->
+    {ok, ParsedExp14} =
+        generate_eval_expression(
+            EvalString14,
+            #{<<"regex">> => list_to_binary(ExtractRegex)}
+        ),
+    EvalOut14 =
+        apply_eval(
+            ParsedExp14,
+            <<"SMITH|19861216||Willow#Mia#Vera|LS1 4BT#LS8 1ZZ">>,
+            <<"9000000001">>,
+            maps:new()
+        ),
+    ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, EvalOut14)),
+    ok.
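+
+%% A usage sketch of the exported form; the pattern and term below are
+%% assumed examples in the style of check_regex_eval/2 above:
+regex_eval_usage_test() ->
+    EvalFun =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($fn, $dob))",
+            #{<<"regex">> => <<"(?P<fn>[^\\|]*)\\|(?P<dob>[0-9]{8}).*">>}
+        ),
+    AttrMap = EvalFun(<<"SMITH|19861216">>, <<"9000000001">>),
+    ?assertMatch(<<"SMITH">>, maps:get(<<"fn">>, AttrMap)),
+    ?assertMatch(<<"19861216">>, maps:get(<<"dob">>, AttrMap)).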
+ + generate_test() -> EvalString13 = "kvsplit($term, \"|\", \":\") |" @@ -488,7 +549,6 @@ generate_test() -> "\"indifferent\", $cup_happy) ", {ok, ParsedExp13} = generate_eval_expression(EvalString13, #{<<"clarke">> => <<"1972">>}), - io:format("ParsedExp ~p~n", [ParsedExp13]), EvalOut13A = apply_eval(ParsedExp13, <<"cup_year:1972">>, <<"ABC1">>, maps:new()), ?assertMatch(<<"good">>, maps:get(<<"cup_happy">>, EvalOut13A)), diff --git a/src/leveled_evallexer.xrl b/src/leveled_evallexer.xrl index dcee00fc..5944445c 100644 --- a/src/leveled_evallexer.xrl +++ b/src/leveled_evallexer.xrl @@ -18,11 +18,14 @@ split : {token, {split, TokenLine}}. slice : {token, {slice, TokenLine}}. index : {token, {index, TokenLine}}. kvsplit : {token, {kvsplit, TokenLine}}. +regex : {token, {regex, TokenLine}}. to_integer : {token, {to_integer, TokenLine}}. to_string : {token, {to_string, TokenLine}}. add : {token, {add, TokenLine}}. subtract : {token, {subtract, TokenLine}}. map : {token, {map, TokenLine}}. +pcre : {token, {pcre, TokenLine}}. +re2 : {token, {re2, TokenLine}}. = : {token, {comparator, '=', TokenLine}}. < : {token, {comparator, '<', TokenLine}}. diff --git a/src/leveled_evalparser.yrl b/src/leveled_evalparser.yrl index be475980..58384aa4 100644 --- a/src/leveled_evalparser.yrl +++ b/src/leveled_evalparser.yrl @@ -2,7 +2,7 @@ Nonterminals top_level eval -operand math_operand +operand math_operand regex_method mapping mappings mappings_list identifiers identifier_list. @@ -10,26 +10,29 @@ Terminals '(' ')' ',' identifier string integer comparator 'PIPE' -delim join split slice index kvsplit map +delim join split slice index kvsplit regex map add subtract -to_integer to_string. +to_integer to_string +pcre re2. Rootsymbol top_level. top_level -> eval: {eval, '$1'}. -eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. -eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. -eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. -eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. -eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. -eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. -eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. -eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. -eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. -eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. -eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. -eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. +eval -> eval 'PIPE' eval : {'PIPE', '$1', 'INTO', '$3'}. +eval -> delim '(' identifier ',' string ',' identifier_list ')' : {delim, '$3', '$5', '$7'}. +eval -> join '(' identifier_list ',' string ',' identifier ')' : {join, '$3', '$5', '$7'}. +eval -> split '(' identifier ',' string ',' identifier ')' : {split, '$3', '$5', '$7'}. +eval -> slice '(' identifier ',' integer ',' identifier ')' : {slice, '$3', '$5', '$7'}. +eval -> index '(' identifier ',' integer ',' integer ',' 'identifier' ')' : {index, '$3', '$5', '$7', '$9'}. +eval -> kvsplit '(' identifier ',' string ',' string ')' : {kvsplit, '$3', '$5', '$7'}. 
+eval -> regex '(' identifier ',' string ',' regex_method ',' identifier_list ')' : {regex, '$3', re_compile('$5', '$7'), '$9'}. +eval -> regex '(' identifier ',' string ',' identifier_list ')' : {regex, '$3', re_compile('$5'), '$7'}. +eval -> map '(' identifier ',' comparator ',' mappings_list ',' operand ',' identifier ')' : {map, '$3', '$5', '$7', '$9', '$11'}. +eval -> to_integer '(' identifier ',' identifier ')' : {to_integer, '$3', '$5'}. +eval -> to_string '(' identifier ',' identifier ')' : {to_string, '$3', '$5'}. +eval -> subtract '(' math_operand ',' math_operand ',' identifier ')' : {subtract, '$3', '$5', '$7'}. +eval -> add '(' math_operand ',' math_operand ',' identifier ')' : {add, '$3', '$5', '$7'}. mappings_list -> '(' mappings ')' : '$2'. @@ -44,6 +47,9 @@ operand -> integer : '$1'. math_operand -> integer : '$1'. math_operand -> identifier : '$1'. +regex_method -> pcre : '$1'. +regex_method -> re2 : '$1'. + identifier_list -> '(' identifiers ')' : strip_ids('$2'). identifiers -> identifier ',' identifiers : ['$1' | '$3']. @@ -59,4 +65,11 @@ strip_ids(IDL) -> lists:map( fun(ID) -> element(3, ID) end, lists:flatten(IDL) - ). \ No newline at end of file + ). + +re_compile(RegexStr) -> + re_compile(RegexStr, {re2, element(2, RegexStr)}). + +re_compile({string, _LN, Regex}, Method) -> + {ok, CRE} = leveled_util:regex_compile(Regex, element(1, Method)), + CRE. \ No newline at end of file diff --git a/src/leveled_util.erl b/src/leveled_util.erl index 9f82fdac..fdf32ae5 100644 --- a/src/leveled_util.erl +++ b/src/leveled_util.erl @@ -12,7 +12,8 @@ t2b/1, safe_rename/4, regex_run/3, - regex_compile/1 + regex_compile/1, + regex_compile/2 ]). -define(WRITE_OPS, [binary, raw, read, write]). @@ -68,7 +69,12 @@ regex_run(Subject, CompiledPCRE, Opts) -> -spec regex_compile(iodata()) -> {ok, reference()}. regex_compile(PlainRegex) -> - re2:compile(PlainRegex). + regex_compile(PlainRegex, re2). + +regex_compile(PlainRegex, re2) -> + re2:compile(PlainRegex); +regex_compile(PlainRegex, pcre) -> + re:compile(PlainRegex). -spec magic_hash(any()) -> 0..16#FFFFFFFF. 
%% @doc
diff --git a/test/end_to_end/iterator_SUITE.erl b/test/end_to_end/iterator_SUITE.erl
index cb7a5a4a..74e71f79 100644
--- a/test/end_to_end/iterator_SUITE.erl
+++ b/test/end_to_end/iterator_SUITE.erl
@@ -844,6 +844,7 @@ capture_and_filter_terms(_Config) ->
 
     SW0 = os:timestamp(),
     {ok, WillowLeedsPCRE} = re:compile(WillowLeedsFinder),
+
     QueryPCRE0 =
         {index_query,
             {Bucket, null},
@@ -879,31 +880,38 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*#LS[^\\|]*",
     FilterFun1 =
         fun(Captures) ->
-            DoB = maps:get(<<"dob">>, Captures),
+            DoB = maps:get(<<"dob">>, Captures, notfound),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
-    {ok, WillowLeedsExtractorPCRE} = re:compile(WillowLeedsExtractor),
+    EvalFunPCRE =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, pcre, ($dob))",
+            #{<<"regex">> => list_to_binary(WillowLeedsExtractor)}
+        ),
+
     QueryPCRE1 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture, WillowLeedsExtractorPCRE, [<<"dob">>], FilterFun1}}
+            {false, {eval, EvalFunPCRE, FilterFun1}}
         },
     {async, RunnerPCRE1} = leveled_bookie:book_returnfolder(Book1, QueryPCRE1),
     BornMid70sPCRE1 = RunnerPCRE1(),
     SW2 = os:timestamp(),
 
-    {ok, WillowLeedsExtractorRE2} = re2:compile(WillowLeedsExtractor),
+    EvalFunRE2 =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob))",
+            #{<<"regex">> => list_to_binary(WillowLeedsExtractor)}
+        ),
     QueryRE2_2 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+            {false, {eval, EvalFunRE2, FilterFun1}}
         },
     {async, RunnerRE2_2} = leveled_bookie:book_returnfolder(Book1, QueryRE2_2),
     BornMid70sRE2_2 = RunnerRE2_2(),
@@ -916,8 +924,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"dob">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}}
+            {<<"dob">>, {eval, EvalFunRE2, AllFun}}
         },
     {async, RunnerRE2_3} = leveled_bookie:book_returnfolder(Book1, QueryRE2_3),
     Results3 = RunnerRE2_3(),
@@ -939,10 +946,15 @@ capture_and_filter_terms(_Config) ->
     WillowLeedsDoubleExtractor =
         "[^\\|]*\\|(?P<dob>[0-9]{8})\\|(?P<dod>[0-9]{0,8})\\|"
         "[^\\|]*#Willow[^\\|]*\\|[^\\|]*#LS[^\\|]*",
-
+    EvalFunRE2_2 =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob, $dod))",
+            #{<<"regex">> => list_to_binary(WillowLeedsDoubleExtractor)}
+        ),
+
     FilterFun2 =
         fun(Captures) ->
-            DoB = maps:get(<<"dob">>, Captures),
+            DoB = maps:get(<<"dob">>, Captures, notfound),
             (DoB >= StartDoB) andalso (DoB =< EndDoB)
         end,
     QueryRE2_4 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(WillowLeedsDoubleExtractor)),
-                    [<<"dob">>, <<"dod">>],
-                    FilterFun2}}
+            {false, {eval, EvalFunRE2_2, FilterFun2}}
         },
     {async, RunnerRE2_4} = leveled_bookie:book_returnfolder(Book1, QueryRE2_4),
     BornMid70sRE2_4 = RunnerRE2_4(),
@@ -966,10 +974,10 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {true,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], FilterFun1}}
+            {true, {eval, EvalFunRE2, FilterFun1}}
         },
     {async, RunnerRE2_5} = leveled_bookie:book_returnfolder(Book1, QueryRE2_5),
+    {ok, WillowLeedsExtractorRE2} = re2:compile(WillowLeedsExtractor),
     BornMid70sRE2_5 =
         lists:filtermap(
             fun({T, K}) ->
@@ -995,11 +1003,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(WillowLeedsExtractor)),
-                    [<<"dob">>],
-                    FilterFun5}}
+            {false, {eval, EvalFunRE2, FilterFun5}}
         },
     {async, RunnerRE2_8} = leveled_bookie:book_returnfolder(Book1, QueryRE2_8),
     BornMid70sRE2_8 = RunnerRE2_8(),
@@ -1010,16 +1014,18 @@ capture_and_filter_terms(_Config) ->
         "[^\\|]*\\|(?P<dob>197[4-6]{1}[0-9]{4})\\|"
        "[0-9]{0,8}\\|[^\\|]*#Willow[^\\|]*\\|"
         "[^\\|]*#LS[^\\|]*",
+    PreFilterEvalFun =
+        leveled_eval:generate_eval_function(
+            "regex($term, :regex, re2, ($dob))",
+            #{<<"regex">> => list_to_binary(PreFilterRE)}
+        ),
+
     QueryRE2_9 =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {capture,
-                    element(2, re2:compile(PreFilterRE)),
-                    [<<"dob">>],
-                    FilterFun5}}
+            {false, {eval, PreFilterEvalFun, FilterFun5}}
         },
     {async, RunnerRE2_9} = leveled_bookie:book_returnfolder(Book1, QueryRE2_9),
     BornMid70sRE2_9 = RunnerRE2_9(),
@@ -1054,10 +1060,7 @@ capture_and_filter_terms(_Config) ->
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {false,
-                {eval,
-                    EvalFun2,
-                    FilterFun6}}
+            {false, {eval, EvalFun2, FilterFun6}}
         },
     {async, RunnerRE2_10} = leveled_bookie:book_returnfolder(Book1, QueryRE2_10),
     BornMid70sRE2_10 = RunnerRE2_10(),
@@ -1100,13 +1103,13 @@ capture_and_filter_terms(_Config) ->
         "~nEval processed index with parsed filter expression took ~w ms~n",
         [timer:now_diff(SW11, SW10) div 1000]),
 
+
     QueryRE2_3_WrongCapture =
         {index_query,
             {Bucket, null},
             {fun testutil:foldkeysfun/3, []},
             {IdxName, <<"M">>, <<"Z">>},
-            {<<"gns">>,
-                {capture, WillowLeedsExtractorRE2, [<<"dob">>], AllFun}}
+            {<<"gns">>, {eval, EvalFunRE2, FilterFun6}}
         },
     {async, RunnerRE2_3_WC} =
        leveled_bookie:book_returnfolder(Book1, QueryRE2_3_WrongCapture),