Added custom limits feature. (#62)
peter-winter authored Dec 8, 2022
1 parent 1251e36 commit d5acf81
Showing 3 changed files with 117 additions and 17 deletions.
9 changes: 8 additions & 1 deletion examples/json-parser.cpp
@@ -238,6 +238,11 @@ constexpr nterm<js_object_element_type> js_object_element("js_object_element");
constexpr nterm<js_array_type> js_array("js_array");
constexpr nterm<js_array_type> js_array_elements("js_array_elements");

struct custom_limits
{
static const size_t state_count_cap = 45;
static const size_t max_sit_count_per_state_cap = 30;
};

constexpr parser js_parser(
js_object,
@@ -274,7 +279,9 @@ constexpr parser js_parser(
>= [](auto&& ob, skip, auto&& e) { return add_object_element(std::move(e), std::move(ob)); },
js_object_element(js_string, ':', js_value)
>= [](auto k, skip, auto&& v) { return to_object_element(k, std::move(v)); }
- )
),
use_generated_lexer{},
custom_limits{}
);

int main(int argc, char* argv[])
74 changes: 61 additions & 13 deletions include/ctpg/ctpg.hpp
@@ -1812,17 +1812,35 @@ struct use_generated_lexer
using type = no_type;
};

- template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage>
struct default_limits
{};

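// get_limits picks the caps: a user-provided Limits type supplies its own
// state_count_cap / max_sit_count_per_state_cap, while default_limits falls
// back to the (pessimistic) situation count for both.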
template<typename T, size_t SituationCount>
struct get_limits
{
static const size_t state_count_cap = T::state_count_cap;
static const size_t max_sit_count_per_state_cap = T::max_sit_count_per_state_cap;
};

template<size_t SituationCount>
struct get_limits<default_limits, SituationCount>
{
static const size_t state_count_cap = SituationCount;
static const size_t max_sit_count_per_state_cap = SituationCount;
};

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage, typename Limits>
class parser
{};

- template<typename RootValueType, typename... Terms, typename... NTerms, typename... Rules, typename LexerUsage>
template<typename RootValueType, typename... Terms, typename... NTerms, typename... Rules, typename LexerUsage, typename Limits>
class parser<
nterm<RootValueType>,
std::tuple<Terms...>,
std::tuple<NTerms...>,
std::tuple<Rules...>,
- LexerUsage
LexerUsage,
Limits
>
{
private:
@@ -1836,6 +1854,16 @@ class parser<
using lexer_type = typename LexerUsage::type;

public:
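// The LexerUsage and Limits arguments are unnamed tag parameters used only
// for class template argument deduction; construction is delegated to the
// main constructor below.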
constexpr parser(
root_nterm_type grammar_root,
term_tuple_type terms,
nterm_tuple_type nterms,
rule_tuple_type&& rules,
LexerUsage,
Limits):
parser(grammar_root, terms, nterms, std::move(rules))
{}

constexpr parser(
root_nterm_type grammar_root,
term_tuple_type terms,
@@ -1977,8 +2005,20 @@ class parser<
s << "PARSER" << "\n\n";

s << "Parser object size: " << sizeof(*this) << "\n";
s << "Parser max number of states: " << max_states << "\n";
s << "Parser number of states: " << state_count << "\n";
s << "Number of states: " << state_count << "(cap: " << state_count_cap << ")\n";

size_t max_sit_count_per_state = 0;
for (size16_t i = 0; i < state_count; ++i)
{
size_t count = 0;
for (size32_t j = 0u; j < situation_address_space_size; ++j)
{
if (states[i].test(j))
count++;
}
max_sit_count_per_state = std::max(max_sit_count_per_state, count);
}
s << "Max number of situations per state: " << max_sit_count_per_state << "(cap: " << max_sit_count_per_state_cap << ")\n";
s << "\n";

s << "RULES\n\n";
@@ -2021,7 +2061,10 @@ class parser<
static const size_t situation_size = max_rule_element_count + 1;
static const size_t situation_address_space_size = rule_count * situation_size * term_count;
static const size_t situation_count = (0 + ... + (Rules::n + 1)) * term_count + 2;
- static const size_t max_states = situation_count;

static const size_t state_count_cap = get_limits<Limits, situation_count>::state_count_cap;
static const size_t max_sit_count_per_state_cap = get_limits<Limits, situation_count>::max_sit_count_per_state_cap;

static const size_t lexer_dfa_size = generate_lexer ? (0 + ... + Terms::dfa_size) : 1;

using value_variant_type = meta::unique_types_variant_t<
@@ -2107,8 +2150,8 @@ class parser<
size8_t has_sr_conflict = 0;
};

- using lr1_parse_table = parse_table_entry[max_states][symbol_count];
- using simple_state_table = situation_set[max_states];
using lr1_parse_table = parse_table_entry[state_count_cap][symbol_count];
using simple_state_table = situation_set[state_count_cap];

struct state_analyzer
{
@@ -2119,7 +2162,7 @@ class parser<
using term_subset = stdex::cbitset<term_count>;
using nterm_subset = stdex::cbitset<nterm_count>;
using right_side_slice_subset = stdex::cbitset<situation_size * rule_count>;
- using situation_vector = stdex::cvector<size32_t, situation_count>;
using situation_vector = stdex::cvector<size32_t, max_sit_count_per_state_cap>;

struct state
{
@@ -2128,7 +2171,7 @@ class parser<
situation_vector situations_by_symbol[symbol_count] = {};
};

- using state_table = state[max_states];
using state_table = state[state_count_cap];

constexpr bool add_situation(size16_t state_idx, size32_t sit_idx, bool to_kernel)
{
@@ -2321,6 +2364,8 @@ class parser<
if (new_state_idx == uninitialized16)
{
new_state_idx = state_count++;
if (state_count > state_count_cap)
throw std::runtime_error("State count exceeds the cap");
}

entry.arg = new_state_idx;
@@ -2437,7 +2482,7 @@ class parser<
simple_state_table& simple_states;
lr1_parse_table& parse_table;

- state states[max_states] = {};
state states[state_count_cap] = {};

size16_t state_count = 0;
situation_set closures_analyzed = {};
Expand Down Expand Up @@ -2958,10 +3003,13 @@ class parser<
};

template<typename Root, typename Terms, typename NTerms, typename Rules>
- parser(Root, Terms, NTerms, Rules&&) -> parser<Root, Terms, NTerms, Rules, use_generated_lexer>;
parser(Root, Terms, NTerms, Rules&&) -> parser<Root, Terms, NTerms, Rules, use_generated_lexer, default_limits>;

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage>
- parser(Root, Terms, NTerms, Rules&&, LexerUsage) -> parser<Root, Terms, NTerms, Rules, LexerUsage>;
parser(Root, Terms, NTerms, Rules&&, LexerUsage) -> parser<Root, Terms, NTerms, Rules, LexerUsage, default_limits>;

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage, typename Limits>
parser(Root, Terms, NTerms, Rules&&, LexerUsage, Limits) -> parser<Root, Terms, NTerms, Rules, LexerUsage, Limits>;

template<typename... Terms>
constexpr auto terms(const Terms&... terms)
51 changes: 48 additions & 3 deletions readme.md
@@ -38,6 +38,7 @@ All it needs is a C++17 compiler!
* [Custom lexical analyzer](#custom-lexical-analyzer)
* [Regular expressions](#regular-expressions)
* [Diagnostics](#diagnostics)
* [Resulting binary size](#resulting-binary-size)

## Installation

@@ -1266,11 +1267,12 @@ The output contains 2 sections: one for syntax analyzer starting with the word *

```
Parser object size: <number>
- Parser max number of states: <number>
- Parser number of states: <number>
Number of states: <number>(cap: <number>)
Max number of situations per state: <number>(cap: <number>)
```

- The size of the parser object may easily be couple of megabytes for some complex grammars, so consider declaring the parser as a constexpr object rather than on local stack.
The size of the parser object may be a couple of megabytes for some complex grammars, so consider declaring the parser as a constexpr object rather than on the local stack.
You may also consider looking at [how to reduce the executable binary size](#resulting-binary-size).

Next, there is a rule set description in form of:

@@ -1353,3 +1355,46 @@ STATE <nr> (unreachable)
```

These are leftovers from the regular expression to DFA conversion, just ignore them.

## Resulting binary size

When creating a parser for a big grammar, you may notice a rather large compiled executable binary.
This is because the LR(1) table creation algorithm needs to predict size caps for various collections (the maximum number of states, the maximum number of situations per state).
It takes a 'prefer safe over perfect' approach, so most of the time it overshoots significantly.
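
To see where the pessimistic defaults come from: both caps default to ```situation_count```, which ```ctpg.hpp``` computes from the grammar dimensions.
A minimal sketch of that arithmetic, using made-up grammar dimensions for illustration:

```c++
#include <cstddef>

// Made-up grammar dimensions, for illustration only.
constexpr std::size_t rule_count = 10;  // number of rules
constexpr std::size_t rule_length = 3;  // symbols on each rule's right-hand side
constexpr std::size_t term_count = 20;  // number of terminals

// Mirrors the situation_count expression in ctpg.hpp,
//   (0 + ... + (Rules::n + 1)) * term_count + 2,
// assuming every rule has the same right-hand side length:
constexpr std::size_t default_cap = rule_count * (rule_length + 1) * term_count + 2;

static_assert(default_cap == 802); // both caps default to this value
```

With these numbers the parser reserves tables for 802 states, even if the grammar only ever produces a few dozen.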

To see whether this is the case for your grammar, print out the diagnostic message:

```
PARSER

Parser object size: 509800
Number of states: 89(cap: 2500)
Max number of situations per state: 490(cap: 2500)
```

If you see numbers significantly lower than the caps, you are dealing with an overshoot.

There is a way to address it:

```c++
struct custom_limits
{
static const ctpg::size_t state_count_cap = 45;
static const ctpg::size_t max_sit_count_per_state_cap = 30;
};

constexpr parser p(
list,
terms(...),
nterms(...),
rules(...),
use_generated_lexer{},
custom_limits{}
);
```

As the 6th argument to the ```parser``` definition, provide a custom limits structure with ```state_count_cap``` and
```max_sit_count_per_state_cap``` defined as ```static const ctpg::size_t``` members.
The values have to be at least as big as the actual numbers reported in the diagnostic message. This way you lower the caps, decreasing the binary size
while making sure the actual numbers still fit.
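
If a cap is set too low, the mistake is caught rather than silently corrupting the tables: the table construction code added in this commit throws when the state count exceeds its cap.
A sketch of the failure mode, using a deliberately undersized, hypothetical ```too_small_limits``` structure:

```c++
// Caps set far below what the grammar actually needs.
struct too_small_limits
{
    static const ctpg::size_t state_count_cap = 2;
    static const ctpg::size_t max_sit_count_per_state_cap = 2;
};

// constexpr parser p(list, terms(...), nterms(...), rules(...),
//                    use_generated_lexer{}, too_small_limits{});
//
// During LR(1) table construction this hits:
//     throw std::runtime_error("State count exceeds the cap");
// and since p is constexpr, the throw is not a constant expression,
// so the mistake is reported at compile time.
```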
