From f8884f7c14f56abe81517318c634f0284c323c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Janosch=20Mu=CC=88ller?= Date: Tue, 28 Feb 2023 15:09:18 +0100 Subject: [PATCH] Catch \k<0> --- CHANGELOG.md | 3 +- lib/regexp_parser/scanner/scanner.rl | 41 ++++++++++++---------------- spec/scanner/errors_spec.rb | 10 ++++--- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74b7535..1145e50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -48,7 +48,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * e.g. in `/(a)(?(1)b|c(?#hello)d)e/`, the 2nd conditional branch included "e" - fixed quantifiers after comment groups being mis-assigned to that group * e.g. in `/a(?#foo){3}/` (matches 'aaa') -- fixed scanner accepting unmatched closing parentheses ')' +- fixed Scanner accepting two cases of invalid Regexp syntax + * unmatched closing parentheses (`)`) and k-backrefs with number 0 (`\k<0>`) * these are a `SyntaxError` in Ruby, so could only be passed as a String * they now raise a `Regexp::Scanner::ScannerError` - fixed some scanner errors not inheriting from `Regexp::Scanner::ScannerError` diff --git a/lib/regexp_parser/scanner/scanner.rl b/lib/regexp_parser/scanner/scanner.rl index a070db7..9fa1214 100644 --- a/lib/regexp_parser/scanner/scanner.rl +++ b/lib/regexp_parser/scanner/scanner.rl @@ -83,10 +83,9 @@ # try to treat every other group head as options group, like Ruby group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?; - group_ref = [gk]; group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*; group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*; - group_number = '-'? . [1-9] . [0-9]*; + group_number = '-'? . [0-9]+; group_level = [+\-] . [0-9]+; group_name = ('<' . group_name_id_ab? . '>') | @@ -95,15 +94,11 @@ group_named = ('?' . group_name ); - group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') | - ("'" . group_name_id_sq? . group_level? "'")); - group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') | - ("'" . group_name_id_sq? . group_level? "'")); + group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') | + ("'" . (group_name_id_sq? | group_number) . group_level? "'")); - group_number_backref = 'k' . (('<' . group_number . group_level? '>') | - ("'" . group_number . group_level? "'")); - group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') | - ("'" . ((group_number . group_level?) | '0') "'")); + group_ref = 'k' . group_ref_body; + group_call = 'g' . group_ref_body; group_type = group_atomic | group_passive | group_absence | group_named; @@ -548,35 +543,35 @@ # Group backreference, named and numbered # ------------------------------------------------------------------------ - backslash . (group_name_backref | group_number_backref) > (backslashed, 4) { + backslash . (group_ref) > (backslashed, 4) { case text = copy(data, ts, te) - when /^\\k(<>|'')/ - raise ValidationError.for(:backref, 'backreference', 'ref ID is empty') - when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/ + when /^\\k(.)[^0-9\-][^+\-]*['>]$/ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text) - when /^\\k(.)\d+\D$/ + when /^\\k(.)[1-9]\d*['>]$/ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text) - when /^\\k(.)-\d+\D$/ + when /^\\k(.)-[1-9]\d*['>]$/ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text) - when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/ + when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text) - when /^\\k(.)-?\d+[+\-]\d+\D$/ + when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text) + else + raise ValidationError.for(:backref, 'backreference', 'invalid ref ID') end }; # Group call, named and numbered # ------------------------------------------------------------------------ - backslash . (group_name_call | group_number_call) > (backslashed, 4) { + backslash . (group_call) > (backslashed, 4) { case text = copy(data, ts, te) - when /^\\g(<>|'')/ - raise ValidationError.for(:backref, 'subexpression call', 'ref ID is empty') - when /^\\g(.)[^\p{digit}+\->][^+\-]*/ + when /^\\g(.)[^0-9+\-].*['>]$/ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text) - when /^\\g(.)\d+\D$/ + when /^\\g(.)\d+['>]$/ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text) when /^\\g(.)[+-]\d+/ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text) + else + raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID') end }; diff --git a/spec/scanner/errors_spec.rb b/spec/scanner/errors_spec.rb index 063266b..5b6862f 100644 --- a/spec/scanner/errors_spec.rb +++ b/spec/scanner/errors_spec.rb @@ -62,10 +62,12 @@ include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-foo)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-u)' include_examples 'scan error', RS::InvalidGroupOption, 'invalid neg option', '(?-mixu)' - include_examples 'scan error', RS::InvalidBackrefError, 'empty backref', '\k<>' - include_examples 'scan error', RS::InvalidBackrefError, 'empty backref', '\k\'\'' - include_examples 'scan error', RS::InvalidBackrefError, 'empty refcall', '\g<>' - include_examples 'scan error', RS::InvalidBackrefError, 'empty refcall', '\g\'\'' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<>' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k\'\'' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k<0>' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid backref', '\k\'0\'' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g<>' + include_examples 'scan error', RS::InvalidBackrefError, 'invalid refcall', '\g\'\'' include_examples 'scan error', RS::UnknownUnicodePropertyError, 'unknown property', '\p{foobar}' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [::]', '[[::]]' include_examples 'scan error', RS::UnknownPosixClassError, 'unknown POSIX class [:^:]', '[[:^:]]'