Added custom limits feature. (#62)
peter-winter authored Dec 8, 2022
1 parent 1251e36 commit d5acf81
Showing 3 changed files with 117 additions and 17 deletions.
9 changes: 8 additions & 1 deletion examples/json-parser.cpp
@@ -238,6 +238,11 @@ constexpr nterm<js_object_element_type> js_object_element("js_object_element");
constexpr nterm<js_array_type> js_array("js_array");
constexpr nterm<js_array_type> js_array_elements("js_array_elements");

struct custom_limits
{
static const size_t state_count_cap = 45;
static const size_t max_sit_count_per_state_cap = 30;
};

constexpr parser js_parser(
js_object,
@@ -274,7 +279,9 @@ constexpr parser js_parser(
>= [](auto&& ob, skip, auto&& e) { return add_object_element(std::move(e), std::move(ob)); },
js_object_element(js_string, ':', js_value)
>= [](auto k, skip, auto&& v) { return to_object_element(k, std::move(v)); }
- )
),
use_generated_lexer{},
custom_limits{}
);

int main(int argc, char* argv[])
74 changes: 61 additions & 13 deletions include/ctpg/ctpg.hpp
@@ -1812,17 +1812,35 @@ struct use_generated_lexer
using type = no_type;
};

- template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage>
struct default_limits
{};

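// get_limits picks the caps: a user-provided Limits type supplies its own
// state_count_cap / max_sit_count_per_state_cap, while default_limits falls
// back to the (pessimistic) situation count for both.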
template<typename T, size_t SituationCount>
struct get_limits
{
static const size_t state_count_cap = T::state_count_cap;
static const size_t max_sit_count_per_state_cap = T::max_sit_count_per_state_cap;
};

template<size_t SituationCount>
struct get_limits<default_limits, SituationCount>
{
static const size_t state_count_cap = SituationCount;
static const size_t max_sit_count_per_state_cap = SituationCount;
};

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage, typename Limits>
class parser
{};

- template<typename RootValueType, typename... Terms, typename... NTerms, typename... Rules, typename LexerUsage>
template<typename RootValueType, typename... Terms, typename... NTerms, typename... Rules, typename LexerUsage, typename Limits>
class parser<
nterm<RootValueType>,
std::tuple<Terms...>,
std::tuple<NTerms...>,
std::tuple<Rules...>,
- LexerUsage
LexerUsage,
Limits
>
{
private:
@@ -1836,6 +1854,16 @@ class parser<
using lexer_type = typename LexerUsage::type;

public:
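// The LexerUsage and Limits arguments are unnamed tag parameters used only
// for class template argument deduction; construction is delegated to the
// main constructor below.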
constexpr parser(
root_nterm_type grammar_root,
term_tuple_type terms,
nterm_tuple_type nterms,
rule_tuple_type&& rules,
LexerUsage,
Limits):
parser(grammar_root, terms, nterms, std::move(rules))
{}

constexpr parser(
root_nterm_type grammar_root,
term_tuple_type terms,
@@ -1977,8 +2005,20 @@ class parser<
s << "PARSER" << "\n\n";

s << "Parser object size: " << sizeof(*this) << "\n";
s << "Parser max number of states: " << max_states << "\n";
s << "Parser number of states: " << state_count << "\n";
s << "Number of states: " << state_count << "(cap: " << state_count_cap << ")\n";

size_t max_sit_count_per_state = 0;
for (size16_t i = 0; i < state_count; ++i)
{
size_t count = 0;
for (size32_t j = 0u; j < situation_address_space_size; ++j)
{
if (states[i].test(j))
count++;
}
max_sit_count_per_state = std::max(max_sit_count_per_state, count);
}
s << "Max number of situations per state: " << max_sit_count_per_state << "(cap: " << max_sit_count_per_state_cap << ")\n";
s << "\n";

s << "RULES\n\n";
@@ -2021,7 +2061,10 @@ class parser<
static const size_t situation_size = max_rule_element_count + 1;
static const size_t situation_address_space_size = rule_count * situation_size * term_count;
static const size_t situation_count = (0 + ... + (Rules::n + 1)) * term_count + 2;
- static const size_t max_states = situation_count;

static const size_t state_count_cap = get_limits<Limits, situation_count>::state_count_cap;
static const size_t max_sit_count_per_state_cap = get_limits<Limits, situation_count>::max_sit_count_per_state_cap;

static const size_t lexer_dfa_size = generate_lexer ? (0 + ... + Terms::dfa_size) : 1;

using value_variant_type = meta::unique_types_variant_t<
@@ -2107,8 +2150,8 @@ class parser<
size8_t has_sr_conflict = 0;
};

- using lr1_parse_table = parse_table_entry[max_states][symbol_count];
- using simple_state_table = situation_set[max_states];
using lr1_parse_table = parse_table_entry[state_count_cap][symbol_count];
using simple_state_table = situation_set[state_count_cap];

struct state_analyzer
{
@@ -2119,7 +2162,7 @@ class parser<
using term_subset = stdex::cbitset<term_count>;
using nterm_subset = stdex::cbitset<nterm_count>;
using right_side_slice_subset = stdex::cbitset<situation_size * rule_count>;
- using situation_vector = stdex::cvector<size32_t, situation_count>;
using situation_vector = stdex::cvector<size32_t, max_sit_count_per_state_cap>;

struct state
{
@@ -2128,7 +2171,7 @@ class parser<
situation_vector situations_by_symbol[symbol_count] = {};
};

- using state_table = state[max_states];
using state_table = state[state_count_cap];

constexpr bool add_situation(size16_t state_idx, size32_t sit_idx, bool to_kernel)
{
@@ -2321,6 +2364,8 @@ class parser<
if (new_state_idx == uninitialized16)
{
new_state_idx = state_count++;
if (state_count > state_count_cap)
throw std::runtime_error("State count exceeds the cap");
}

entry.arg = new_state_idx;
@@ -2437,7 +2482,7 @@ class parser<
simple_state_table& simple_states;
lr1_parse_table& parse_table;

- state states[max_states] = {};
state states[state_count_cap] = {};

size16_t state_count = 0;
situation_set closures_analyzed = {};
Expand Down Expand Up @@ -2958,10 +3003,13 @@ class parser<
};

template<typename Root, typename Terms, typename NTerms, typename Rules>
- parser(Root, Terms, NTerms, Rules&&) -> parser<Root, Terms, NTerms, Rules, use_generated_lexer>;
parser(Root, Terms, NTerms, Rules&&) -> parser<Root, Terms, NTerms, Rules, use_generated_lexer, default_limits>;

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage>
- parser(Root, Terms, NTerms, Rules&&, LexerUsage) -> parser<Root, Terms, NTerms, Rules, LexerUsage>;
parser(Root, Terms, NTerms, Rules&&, LexerUsage) -> parser<Root, Terms, NTerms, Rules, LexerUsage, default_limits>;

template<typename Root, typename Terms, typename NTerms, typename Rules, typename LexerUsage, typename Limits>
parser(Root, Terms, NTerms, Rules&&, LexerUsage, Limits) -> parser<Root, Terms, NTerms, Rules, LexerUsage, Limits>;

template<typename... Terms>
constexpr auto terms(const Terms&... terms)
51 changes: 48 additions & 3 deletions readme.md
@@ -38,6 +38,7 @@ All it needs is a C++17 compiler!
* [Custom lexical analyzer](#custom-lexical-analyzer)
* [Regular expressions](#regular-expressions)
* [Diagnostics](#diagnostics)
* [Resulting binary size](#resulting-binary-size)

## Installation

@@ -1266,11 +1267,12 @@ The output contains 2 sections: one for syntax analyzer starting with the word *

```
Parser object size: <number>
- Parser max number of states: <number>
- Parser number of states: <number>
Number of states: <number>(cap: <number>)
Max number of situations per state: <number>(cap: <number>)
```

- The size of the parser object may easily be couple of megabytes for some complex grammars, so consider declaring the parser as a constexpr object rather than on local stack.
The size of the parser object may be a couple of megabytes for some complex grammars, so consider declaring the parser as a constexpr object rather than on the local stack.
You may also consider looking at [how to reduce the executable binary size](#resulting-binary-size).

Next, there is a rule set description in form of:

@@ -1353,3 +1355,46 @@ STATE <nr> (unreachable)
```

These are leftovers from the regular expression to DFA conversion, just ignore them.

## Resulting binary size

When creating a parser for a big grammar, you may notice a rather large compiled executable binary.
This is because the LR(1) table creation algorithm needs to predict size caps for various collections (the maximum number of states, the maximum number of situations per state).
It takes a 'prefer safe over perfect' approach, so most of the time it overshoots significantly.
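
To see where the pessimistic defaults come from: both caps default to ```situation_count```, which ```ctpg.hpp``` computes from the grammar dimensions.
A minimal sketch of that arithmetic, using made-up grammar dimensions for illustration:

```c++
#include <cstddef>

// Made-up grammar dimensions, for illustration only.
constexpr std::size_t rule_count = 10;  // number of rules
constexpr std::size_t rule_length = 3;  // symbols on each rule's right-hand side
constexpr std::size_t term_count = 20;  // number of terminals

// Mirrors the situation_count expression in ctpg.hpp,
//   (0 + ... + (Rules::n + 1)) * term_count + 2,
// assuming every rule has the same right-hand side length:
constexpr std::size_t default_cap = rule_count * (rule_length + 1) * term_count + 2;

static_assert(default_cap == 802); // both caps default to this value
```

With these numbers the parser reserves tables for 802 states, even if the grammar only ever produces a few dozen.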

To see whether this is the case for your grammar, print out the diagnostic message:

```
PARSER

Parser object size: 509800
Number of states: 89(cap: 2500)
Max number of situations per state: 490(cap: 2500)
```

If you see numbers significantly lower than the caps, you are dealing with an overshoot.

There is a way to address it:

```c++
struct custom_limits
{
static const ctpg::size_t state_count_cap = 45;
static const ctpg::size_t max_sit_count_per_state_cap = 30;
};

constexpr parser p(
list,
terms(...),
nterms(...),
rules(...),
use_generated_lexer{},
custom_limits{}
);
```

As the 6th argument to the ```parser``` definition, provide a custom limits structure with ```state_count_cap``` and
```max_sit_count_per_state_cap``` defined as ```static const ctpg::size_t``` members.
The values have to be at least as big as the actual numbers reported in the diagnostic message. This way you lower the caps, decreasing the binary size
while making sure the actual numbers still fit.
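
If a cap is set too low, the mistake is caught rather than silently corrupting the tables: the table construction code added in this commit throws when the state count exceeds its cap.
A sketch of the failure mode, using a deliberately undersized, hypothetical ```too_small_limits``` structure:

```c++
// Caps set far below what the grammar actually needs.
struct too_small_limits
{
    static const ctpg::size_t state_count_cap = 2;
    static const ctpg::size_t max_sit_count_per_state_cap = 2;
};

// constexpr parser p(list, terms(...), nterms(...), rules(...),
//                    use_generated_lexer{}, too_small_limits{});
//
// During LR(1) table construction this hits:
//     throw std::runtime_error("State count exceeds the cap");
// and since p is constexpr, the throw is not a constant expression,
// so the mistake is reported at compile time.
```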
