diff --git a/pest_derive/src/ast.rs b/pest_derive/src/ast.rs index 12d2db5e..81404bf6 100644 --- a/pest_derive/src/ast.rs +++ b/pest_derive/src/ast.rs @@ -36,6 +36,9 @@ pub enum Expr { Opt(Box), Rep(Box), RepOnce(Box), + RepExact(Box, u32), + RepMin(Box, u32), + RepMax(Box, u32), RepMinMax(Box, u32, u32), Push(Box) } @@ -73,6 +76,18 @@ impl Expr { let mapped = Box::new(map_internal(*expr, f)); Expr::RepOnce(mapped) } + Expr::RepExact(expr, max) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepExact(mapped, max) + } + Expr::RepMin(expr, num) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepMin(mapped, num) + } + Expr::RepMax(expr, num) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepMax(mapped, num) + } Expr::RepMinMax(expr, min, max) => { let mapped = Box::new(map_internal(*expr, f)); Expr::RepMinMax(mapped, min, max) @@ -122,6 +137,18 @@ impl Expr { let mapped = Box::new(map_internal(*expr, f)); Expr::RepOnce(mapped) } + Expr::RepExact(expr, num) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepExact(mapped, num) + } + Expr::RepMin(expr, max) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepMin(mapped, max) + } + Expr::RepMax(expr, max) => { + let mapped = Box::new(map_internal(*expr, f)); + Expr::RepMax(mapped, max) + } Expr::RepMinMax(expr, min, max) => { let mapped = Box::new(map_internal(*expr, f)); Expr::RepMinMax(mapped, min, max) diff --git a/pest_derive/src/lib.rs b/pest_derive/src/lib.rs index 77397dbb..3ec7fd0a 100644 --- a/pest_derive/src/lib.rs +++ b/pest_derive/src/lib.rs @@ -152,6 +152,8 @@ //! | `e*` | matches `e` zero or more times | //! | `e+` | matches `e` one or more times | //! | `e{n}` | matches `e` exactly `n` times | +//! | `e{, n}` | matches `e` at most `n` times | +//! | `e{n,} ` | matches `e` at least `n` times | //! | `e{m, n}` | matches `e` between `m` and `n` times inclusively | //! | `e?` | optionally matches `e` | //! | `&e` | matches `e` without making progress | @@ -226,8 +228,12 @@ #![doc(html_root_url = "https://docs.rs/pest_derive")] #![recursion_limit="256"] +#[cfg(test)] #[macro_use] extern crate pest; +#[cfg(not(test))] +extern crate pest; + extern crate proc_macro; #[macro_use] extern crate quote; diff --git a/pest_derive/src/optimizer.rs b/pest_derive/src/optimizer.rs index ac5cdaf1..a294ae1c 100644 --- a/pest_derive/src/optimizer.rs +++ b/pest_derive/src/optimizer.rs @@ -48,6 +48,70 @@ pub fn optimize(rules: Vec) -> Vec { Box::new(Expr::Rep(expr)) ) } + Expr::RepExact(expr, num) => { + (1..num + 1).map(|_| { + *expr.clone() + }) + .rev() + .fold(None, |rep, expr| { + match rep { + None => Some(expr), + Some(rep) => { + Some( + Expr::Seq( + Box::new(expr), + Box::new(rep) + ) + ) + } + } + }) + .unwrap() + } + Expr::RepMin(expr, min) => { + (1..min + 2).map(|i| { + if i <= min { + *expr.clone() + } else { + Expr::Rep(expr.clone()) + } + }) + .rev() + .fold(None, |rep, expr| { + match rep { + None => Some(expr), + Some(rep) => { + Some( + Expr::Seq( + Box::new(expr), + Box::new(rep) + ) + ) + } + } + }) + .unwrap() + } + Expr::RepMax(expr, max) => { + (1..max + 1).map(|_| { + Expr::Opt(expr.clone()) + }) + .rev() + .fold(None, |rep, expr| { + match rep { + None => Some(expr), + Some(rep) => { + Some( + Expr::Seq( + Box::new(expr), + Box::new(rep) + ) + ) + } + } + }) + .unwrap() + } Expr::RepMinMax(expr, min, max) => { (1..max + 1).map(|i| { if i <= min { @@ -170,6 +234,144 @@ mod tests { assert_eq!(optimize(rules), concatenated); } + #[test] + fn unroll_loop_exact() { + let rules = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::RepExact( + Box::new(Expr::Ident(Ident::new("a"))), + 3 + ) + } + ]; + let unrolled = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::Seq( + Box::new(Expr::Ident(Ident::new("a"))), + Box::new(Expr::Seq( + Box::new(Expr::Ident(Ident::new("a"))), + Box::new(Expr::Ident(Ident::new("a"))) + )) + ) + } + ]; + + assert_eq!(optimize(rules), unrolled); + } + + + #[test] + fn unroll_loop_max() { + let rules = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::RepMax( + Box::new(Expr::Str("a".to_owned())), + 3 + ) + } + ]; + let unrolled = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::Seq( + Box::new(Expr::Opt( + Box::new(Expr::Str("a".to_owned())) + )), + Box::new(Expr::Seq( + Box::new(Expr::Opt( + Box::new(Expr::Str("a".to_owned())) + )), + Box::new(Expr::Opt( + Box::new(Expr::Str("a".to_owned())) + )) + )) + ) + } + ]; + + assert_eq!(optimize(rules), unrolled); + } + + #[test] + fn unroll_loop_min() { + let rules = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::RepMin( + Box::new(Expr::Str("a".to_owned())), + 2 + ) + } + ]; + let unrolled = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::Seq( + Box::new(Expr::Str("a".to_owned())), + Box::new(Expr::Seq( + Box::new(Expr::Str("a".to_owned())), + Box::new(Expr::Rep( + Box::new(Expr::Str("a".to_owned())) + )) + )) + ) + } + ]; + + assert_eq!(optimize(rules), unrolled); + } + + #[test] + fn unroll_loop_min_max() { + let rules = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::RepMinMax( + Box::new(Expr::Str("a".to_owned())), + 2, + 3 + ) + } + ]; + let unrolled = vec![ + Rule { + name: Ident::new("rule"), + ty: RuleType::Atomic, + expr: Expr::Seq( + /* TODO possible room for improvement here: + * if the sequences were rolled out in the opposite + * order, we could further optimize the strings + * in cases like this. + Box::new(Expr::Str("aa".to_owned())), + Box::new(Expr::Opt( + Box::new(Expr::Str("a".to_owned())) + )) + */ + Box::new(Expr::Str("a".to_owned())), + Box::new(Expr::Seq( + Box::new(Expr::Str("a".to_owned())), + Box::new(Expr::Opt( + Box::new(Expr::Str("a".to_owned())), + )) + )) + ) + } + ]; + + assert_eq!(optimize(rules), unrolled); + } + + #[test] fn concat_insensitive_strings() { let rules = vec![ diff --git a/pest_derive/src/parser.rs b/pest_derive/src/parser.rs index 439640a8..033a7e6b 100644 --- a/pest_derive/src/parser.rs +++ b/pest_derive/src/parser.rs @@ -43,6 +43,9 @@ pub enum GrammarRule { optional_operator, repeat_operator, repeat_once_operator, + repeat_exact, + repeat_min, + repeat_max, repeat_min_max, comma, push, @@ -345,6 +348,12 @@ impl Parser for GrammarParser { repeat_operator(pos, state) }).or_else(|pos| { repeat_once_operator(pos, state) + }).or_else(|pos| { + repeat_exact(pos, state) + }).or_else(|pos| { + repeat_min(pos, state) + }).or_else(|pos| { + repeat_max(pos, state) }).or_else(|pos| { repeat_min_max(pos, state) }) @@ -413,6 +422,77 @@ impl Parser for GrammarParser { }) } + fn repeat_exact( + pos: Position, + state: &mut ParserState + ) -> Result, Position> { + state.rule(GrammarRule::repeat_exact, pos, |state, pos| { + state.sequence(move |state| { + pos.sequence(|pos| { + opening_brace(pos, state).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + number(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + closing_brace(pos, state) + }) + }) + }) + }) + } + + fn repeat_min( + pos: Position, + state: &mut ParserState + ) -> Result, Position> { + state.rule(GrammarRule::repeat_min, pos, |state, pos| { + state.sequence(move |state| { + pos.sequence(|pos| { + opening_brace(pos, state).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + number(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + comma(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + closing_brace(pos, state) + }) + }) + }) + }) + } + + fn repeat_max( + pos: Position, + state: &mut ParserState + ) -> Result, Position> { + state.rule(GrammarRule::repeat_max, pos, |state, pos| { + state.sequence(move |state| { + pos.sequence(|pos| { + opening_brace(pos, state).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + comma(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + number(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + closing_brace(pos, state) + }) + }) + }) + }) + } + fn repeat_min_max( pos: Position, state: &mut ParserState @@ -427,17 +507,13 @@ impl Parser for GrammarParser { }).and_then(|pos| { skip(pos, state) }).and_then(|pos| { - pos.optional(|pos| { - state.sequence(move |state| { - pos.sequence(|pos| { - comma(pos, state).and_then(|pos| { - skip(pos, state) - }).and_then(|pos| { - number(pos, state) - }) - }) - }) - }) + comma(pos, state) + }).and_then(|pos| { + skip(pos, state) + }).and_then(|pos| { + number(pos, state) + }).and_then(|pos| { + skip(pos, state) }).and_then(|pos| { closing_brace(pos, state) }) @@ -820,6 +896,9 @@ impl Parser for GrammarParser { GrammarRule::optional_operator => optional_operator(pos, &mut state), GrammarRule::repeat_operator => repeat_operator(pos, &mut state), GrammarRule::repeat_once_operator => repeat_once_operator(pos, &mut state), + GrammarRule::repeat_exact => repeat_exact(pos, &mut state), + GrammarRule::repeat_min=> repeat_min(pos, &mut state), + GrammarRule::repeat_max => repeat_max(pos, &mut state), GrammarRule::repeat_min_max => repeat_min_max(pos, &mut state), GrammarRule::comma => comma(pos, &mut state), GrammarRule::push => push(pos, &mut state), @@ -864,6 +943,9 @@ pub enum ParserExpr { Opt(Box>), Rep(Box>), RepOnce(Box>), + RepExact(Box>, u32), + RepMin(Box>, u32), + RepMax(Box>, u32), RepMinMax(Box>, u32, u32), Push(Box>) } @@ -897,6 +979,18 @@ fn convert_node(node: ParserNode) -> Expr { ParserExpr::Opt(node) => Expr::Opt(Box::new(convert_node(*node))), ParserExpr::Rep(node) => Expr::Rep(Box::new(convert_node(*node))), ParserExpr::RepOnce(node) => Expr::RepOnce(Box::new(convert_node(*node))), + ParserExpr::RepExact(node,num) => Expr::RepExact( + Box::new(convert_node(*node)), + num + ), + ParserExpr::RepMin(node,max) => Expr::RepMin( + Box::new(convert_node(*node)), + max + ), + ParserExpr::RepMax(node,max) => Expr::RepMax( + Box::new(convert_node(*node)), + max + ), ParserExpr::RepMinMax(node, min, max) => Expr::RepMinMax( Box::new(convert_node(*node)), min, @@ -1071,7 +1165,7 @@ fn consume_expr( span: start.span(pair.into_span().end_pos()) } } - GrammarRule::repeat_min_max => { + GrammarRule::repeat_exact => { let overflow = |span| { let error: Error<(), _> = Error::CustomErrorSpan { message: "number cannot overflow u32".to_owned(), @@ -1086,31 +1180,97 @@ fn consume_expr( inner.next().unwrap(); // opening_brace let number = inner.next().unwrap(); - let min: u32 = number.as_str() + let num: u32 = number.as_str() .parse() .expect(&overflow(number.into_span())); - match inner.next().unwrap().as_rule() { - GrammarRule::comma => { - let number = inner.next().unwrap(); - let max: u32 = number.as_str() - .parse() - .expect(&overflow(number.into_span())); - - let start = node.span.start_pos(); - ParserNode { - expr: ParserExpr::RepMinMax(Box::new(node), min, max), - span: start.span(pair.into_span().end_pos()) - } - } - GrammarRule::closing_brace => { - let start = node.span.start_pos(); - ParserNode { - expr: ParserExpr::RepMinMax(Box::new(node), min, min), - span: start.span(pair.into_span().end_pos()) - } - } - _ => unreachable!() + let start = node.span.start_pos(); + ParserNode { + expr: ParserExpr::RepExact(Box::new(node), num), + span: start.span(pair.into_span().end_pos()) + } + } + GrammarRule::repeat_min => { + let overflow = |span| { + let error: Error<(), _> = Error::CustomErrorSpan { + message: "number cannot overflow u32".to_owned(), + span + }; + + format!("parsing error\n\n{}", error) + }; + + let mut inner = pair.clone().into_inner(); + + inner.next().unwrap(); // opening_brace + + let min_number = inner.next().unwrap(); + let min: u32 = min_number.as_str() + .parse() + .expect(&overflow(min_number.into_span())); + + let start = node.span.start_pos(); + ParserNode { + expr: ParserExpr::RepMin(Box::new(node), min), + span: start.span(pair.into_span().end_pos()) + } + } + GrammarRule::repeat_max => { + let overflow = |span| { + let error: Error<(), _> = Error::CustomErrorSpan { + message: "number cannot overflow u32".to_owned(), + span + }; + + format!("parsing error\n\n{}", error) + }; + + let mut inner = pair.clone().into_inner(); + + inner.next().unwrap(); // opening_brace + inner.next().unwrap(); // comma + + let max_number = inner.next().unwrap(); + let max: u32 = max_number.as_str() + .parse() + .expect(&overflow(max_number.into_span())); + + let start = node.span.start_pos(); + ParserNode { + expr: ParserExpr::RepMax(Box::new(node), max), + span: start.span(pair.into_span().end_pos()) + } + } + GrammarRule::repeat_min_max => { + let overflow = |span| { + let error: Error<(), _> = Error::CustomErrorSpan { + message: "number cannot overflow u32".to_owned(), + span + }; + + format!("parsing error\n\n{}", error) + }; + + let mut inner = pair.clone().into_inner(); + + inner.next().unwrap(); // opening_brace + + let min_number = inner.next().unwrap(); + let min: u32 = min_number.as_str() + .parse() + .expect(&overflow(min_number.into_span())); + + inner.next().unwrap(); // comma + + let max_number = inner.next().unwrap(); + let max: u32 = max_number.as_str() + .parse() + .expect(&overflow(max_number.into_span())); + + let start = node.span.start_pos(); + ParserNode { + expr: ParserExpr::RepMinMax(Box::new(node), min, max), + span: start.span(pair.into_span().end_pos()) } } GrammarRule::closing_paren => { @@ -1280,34 +1440,68 @@ mod tests { } #[test] - fn repeat_min_max() { + fn repeat_exact() { parses_to! { parser: GrammarParser, - input: "{1, 2}", - rule: GrammarRule::repeat_min_max, + input: "{1}", + rule: GrammarRule::repeat_exact, tokens: [ - repeat_min_max(0, 6, [ + repeat_exact(0, 3, [ opening_brace(0, 1), number(1, 2), - comma(2, 3), - number(4, 5), - closing_brace(5, 6) + closing_brace(2, 3) ]) ] }; } #[test] - fn repeat_exact() { + fn repeat_min() { parses_to! { parser: GrammarParser, - input: "{1}", + input: "{2,}", + rule: GrammarRule::repeat_min, + tokens: [ + repeat_min(0, 4, [ + opening_brace(0,1), + number(1,2), + comma(2,3), + closing_brace(3,4) + ]) + ] + } + } + + #[test] + fn repeat_max() { + parses_to! { + parser: GrammarParser, + input: "{, 3}", + rule: GrammarRule::repeat_max, + tokens: [ + repeat_max(0, 5, [ + opening_brace(0,1), + comma(1,2), + number(3,4), + closing_brace(4,5) + ]) + ] + } + } + + #[test] + fn repeat_min_max() { + parses_to! { + parser: GrammarParser, + input: "{1, 2}", rule: GrammarRule::repeat_min_max, tokens: [ - repeat_min_max(0, 3, [ + repeat_min_max(0, 6, [ opening_brace(0, 1), number(1, 2), - closing_brace(2, 3) + comma(2, 3), + number(4, 5), + closing_brace(5, 6) ]) ] }; @@ -1518,7 +1712,16 @@ mod tests { parser: GrammarParser, input: "a = { b ~ }", rule: GrammarRule::grammar_rules, - positives: vec![GrammarRule::term], + positives: vec![ + GrammarRule::opening_paren, + GrammarRule::positive_predicate_operator, + GrammarRule::negative_predicate_operator, + GrammarRule::push, + GrammarRule::identifier, + GrammarRule::quote, + GrammarRule::insensitive_string, + GrammarRule::single_quote + ], negatives: vec![], pos: 10 }; @@ -1655,7 +1858,7 @@ mod tests { #[test] fn ast() { - let input = "rule = _{ a{1} ~ \"b\"{1, 2} | !(^\"c\" | push('d'..'e'))?* }"; + let input = "rule = _{ a{1} ~ \"a\"{3,} ~ b{, 2} ~ \"b\"{1, 2} | !(^\"c\" | push('d'..'e'))?* }"; let pairs = GrammarParser::parse_str(GrammarRule::grammar_rules, input).unwrap(); let ast = consume_rules_with_spans(pairs); @@ -1667,10 +1870,21 @@ mod tests { ty: RuleType::Silent, expr: Expr::Choice( Box::new(Expr::Seq( - Box::new(Expr::RepMinMax( - Box::new(Expr::Ident(Ident::new("a"))), - 1, - 1 + Box::new(Expr::Seq( + Box::new(Expr::Seq( + Box::new(Expr::RepExact( + Box::new(Expr::Ident(Ident::new("a"))), + 1 + )), + Box::new(Expr::RepMin( + Box::new(Expr::Str("a".to_owned())), + 3 + )), + )), + Box::new(Expr::RepMax( + Box::new(Expr::Ident(Ident::new("b"))), + 2 + )), )), Box::new(Expr::RepMinMax( Box::new(Expr::Str("b".to_owned())), diff --git a/pest_derive/tests/grammar.pest b/pest_derive/tests/grammar.pest index a6801d65..863f2bfe 100644 --- a/pest_derive/tests/grammar.pest +++ b/pest_derive/tests/grammar.pest @@ -28,6 +28,10 @@ repeat_once_atomic = @{ string+ } repeat_min_max = { string{2, 3} } repeat_min_max_atomic = @{ string{2, 3} } repeat_exact = { string{2} } +repeat_min = { string{2,} } +repeat_min_atomic = @{ string{2,} } +repeat_max = { string{, 2} } +repeat_max_atomic = @{ string{, 2} } peek_ = { push(range) ~ push(range) ~ peek ~ peek } pop_ = { push(range) ~ push(range) ~ pop ~ pop } pop_fail = { push(range) ~ !pop ~ range ~ pop } diff --git a/pest_derive/tests/grammar.rs b/pest_derive/tests/grammar.rs index 0bdc3f15..4fdfc77b 100644 --- a/pest_derive/tests/grammar.rs +++ b/pest_derive/tests/grammar.rs @@ -473,6 +473,180 @@ fn repeat_exact() { }; } +#[test] +#[should_panic] +fn repeat_min_once() { + parses_to! { + parser: GrammarParser, + input: "abc", + rule: Rule::repeat_min, + tokens: [] + }; +} + +#[test] +fn repeat_min_twice() { + parses_to! { + parser: GrammarParser, + input: "abc abc", + rule: Rule::repeat_min, + tokens: [ + repeat_min(0, 7, [ + string(0, 3), + string(4, 7) + ]) + ] + }; +} + +#[test] +fn repeat_min_thrice() { + parses_to! { + parser: GrammarParser, + input: "abc abc abc", + rule: Rule::repeat_min, + tokens: [ + repeat_min(0, 12, [ + string(0, 3), + string(4, 7), + string(9, 12) + ]) + ] + }; +} + +#[test] +#[should_panic] +fn repeat_min_atomic_once() { + parses_to! { + parser: GrammarParser, + input: "abc", + rule: Rule::repeat_min_atomic, + tokens: [] + }; +} + +#[test] +fn repeat_min_atomic_twice() { + parses_to! { + parser: GrammarParser, + input: "abcabc", + rule: Rule::repeat_min_atomic, + tokens: [ + repeat_min_atomic(0, 6) + ] + }; +} + +#[test] +fn repeat_min_atomic_thrice() { + parses_to! { + parser: GrammarParser, + input: "abcabcabc", + rule: Rule::repeat_min_atomic, + tokens: [ + repeat_min_atomic(0, 9) + ] + }; +} + +#[test] +#[should_panic] +fn repeat_min_atomic_space() { + parses_to! { + parser: GrammarParser, + input: "abc abc", + rule: Rule::repeat_min_atomic, + tokens: [] + }; +} + +#[test] +fn repeat_max_once() { + parses_to! { + parser: GrammarParser, + input: "abc", + rule: Rule::repeat_max, + tokens: [ + repeat_max(0, 3, [ + string(0, 3) + ]) + ] + }; +} + +#[test] +fn repeat_max_twice() { + parses_to! { + parser: GrammarParser, + input: "abc abc", + rule: Rule::repeat_max, + tokens: [ + repeat_max(0, 7, [ + string(0, 3), + string(4, 7) + ]) + ] + }; +} + +#[test] +#[should_panic] +fn repeat_max_thrice() { + parses_to! { + parser: GrammarParser, + input: "abc abc", + rule: Rule::repeat_max, + tokens: [] + }; +} + +#[test] +fn repeat_max_atomic_once() { + parses_to! { + parser: GrammarParser, + input: "abc", + rule: Rule::repeat_max_atomic, + tokens: [ + repeat_max_atomic(0, 3) + ] + }; +} + +#[test] +fn repeat_max_atomic_twice() { + parses_to! { + parser: GrammarParser, + input: "abcabc", + rule: Rule::repeat_max_atomic, + tokens: [ + repeat_max_atomic(0, 6) + ] + }; +} + +#[test] +#[should_panic] +fn repeat_max_atomic_thrice() { + parses_to! { + parser: GrammarParser, + input: "abcabcabc", + rule: Rule::repeat_max_atomic, + tokens: [] + }; +} + +#[test] +#[should_panic] +fn repeat_max_atomic_space() { + parses_to! { + parser: GrammarParser, + input: "abc abc", + rule: Rule::repeat_max_atomic, + tokens: [] + }; +} + #[test] fn repeat_comment() { parses_to! {