@@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111111 "character class difference is not supported" ;
/// Error message for character class symmetric difference, which is not
/// supported.
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
    "character class symmetric difference is not supported";
/// Error message for a `\b{...}` assertion whose name is not followed by a
/// closing `}`: either the pattern ended or a character outside `[-A-Za-z]`
/// was found first.
const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
    "special word boundary assertion is unclosed or has an invalid character";
/// Error message for a well-formed `\b{...}` assertion whose name is not one
/// of the recognized special word boundaries.
const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
    "special word boundary assertion is unrecognized";
/// Error message for a `\b{` with nothing after the `{`. At that point it is
/// still ambiguous whether a special word boundary or a counted repetition
/// was intended, hence the wording.
const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
    "found start of special word boundary or repetition without an end";
114120
115121/// A regular expression parser.
116122///
@@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
479485 'v' => special ( '\x0B' ) ,
480486 'A' => Ok ( Hir :: look ( hir:: Look :: Start ) ) ,
481487 'z' => Ok ( Hir :: look ( hir:: Look :: End ) ) ,
482- 'b' => Ok ( Hir :: look ( hir:: Look :: Word ) ) ,
488+ 'b' => {
489+ let mut hir = Hir :: look ( hir:: Look :: Word ) ;
490+ if !self . is_done ( ) && self . char ( ) == '{' {
491+ if let Some ( special) =
492+ self . maybe_parse_special_word_boundary ( ) ?
493+ {
494+ hir = special;
495+ }
496+ }
497+ Ok ( hir)
498+ }
483499 'B' => Ok ( Hir :: look ( hir:: Look :: WordNegate ) ) ,
500+ '<' => Ok ( Hir :: look ( hir:: Look :: WordStart ) ) ,
501+ '>' => Ok ( Hir :: look ( hir:: Look :: WordEnd ) ) ,
484502 _ => Err ( Error :: new ( ERR_ESCAPE_UNRECOGNIZED ) ) ,
485503 }
486504 }
487505
506+ /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
507+ /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
508+ ///
509+ /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
510+ /// if it fails it will just return `None` with no error. This is done
511+ /// because `\b{5}` is a valid expression and we want to let that be parsed
512+ /// by the existing counted repetition parsing code. (I thought about just
513+ /// invoking the counted repetition code from here, but it seemed a little
514+ /// ham-fisted.)
515+ ///
516+ /// Unlike `maybe_parse_ascii_class` though, this can return an error.
517+ /// Namely, if we definitely know it isn't a counted repetition, then we
518+ /// return an error specific to the specialty word boundaries.
519+ ///
520+ /// This assumes the parser is positioned at a `{` immediately following
521+ /// a `\b`. When `None` is returned, the parser is returned to the position
522+ /// at which it started: pointing at a `{`.
523+ ///
524+ /// The position given should correspond to the start of the `\b`.
525+ fn maybe_parse_special_word_boundary ( & self ) -> Result < Option < Hir > , Error > {
526+ assert_eq ! ( self . char ( ) , '{' ) ;
527+
528+ let is_valid_char = |c| match c {
529+ 'A' ..='Z' | 'a' ..='z' | '-' => true ,
530+ _ => false ,
531+ } ;
532+ let start = self . pos ( ) ;
533+ if !self . bump_and_bump_space ( ) {
534+ return Err ( Error :: new ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF ) ) ;
535+ }
536+ // This is one of the critical bits: if the first non-whitespace
537+ // character isn't in [-A-Za-z] (i.e., this can't be a special word
538+ // boundary), then we bail and let the counted repetition parser deal
539+ // with this.
540+ if !is_valid_char ( self . char ( ) ) {
541+ self . pos . set ( start) ;
542+ self . char . set ( Some ( '{' ) ) ;
543+ return Ok ( None ) ;
544+ }
545+
546+ // Now collect up our chars until we see a '}'.
547+ let mut scratch = String :: new ( ) ;
548+ while !self . is_done ( ) && is_valid_char ( self . char ( ) ) {
549+ scratch. push ( self . char ( ) ) ;
550+ self . bump_and_bump_space ( ) ;
551+ }
552+ if self . is_done ( ) || self . char ( ) != '}' {
553+ return Err ( Error :: new ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED ) ) ;
554+ }
555+ self . bump ( ) ;
556+ let kind = match scratch. as_str ( ) {
557+ "start" => hir:: Look :: WordStart ,
558+ "end" => hir:: Look :: WordEnd ,
559+ "start-half" => hir:: Look :: WordStartHalf ,
560+ "end-half" => hir:: Look :: WordEndHalf ,
561+ _ => {
562+ return Err ( Error :: new ( ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED ) )
563+ }
564+ } ;
565+ Ok ( Some ( Hir :: look ( kind) ) )
566+ }
567+
488568 /// Parse a hex representation of a Unicode codepoint. This handles both
489569 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
490570 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
@@ -1948,8 +2028,6 @@ bar
19482028 assert_eq ! ( ERR_UNICODE_CLASS_UNSUPPORTED , perr( r"\pL" ) ) ;
19492029 assert_eq ! ( ERR_UNICODE_CLASS_UNSUPPORTED , perr( r"\p{L}" ) ) ;
19502030 assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\i" ) ) ;
1951- assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\<" ) ) ;
1952- assert_eq ! ( ERR_ESCAPE_UNRECOGNIZED , perr( r"\>" ) ) ;
19532031 assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"?" ) ) ;
19542032 assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"*" ) ) ;
19552033 assert_eq ! ( ERR_UNCOUNTED_REP_SUB_MISSING , perr( r"+" ) ) ;
@@ -1983,6 +2061,11 @@ bar
19832061 assert_eq ! ( ERR_CLASS_INTERSECTION_UNSUPPORTED , perr( r"[a&&b]" ) ) ;
19842062 assert_eq ! ( ERR_CLASS_DIFFERENCE_UNSUPPORTED , perr( r"[a--b]" ) ) ;
19852063 assert_eq ! ( ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED , perr( r"[a~~b]" ) ) ;
2064+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED , perr( r"\b{foo" ) ) ;
2065+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED , perr( r"\b{foo!}" ) ) ;
2066+ assert_eq ! ( ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED , perr( r"\b{foo}" ) ) ;
2067+ assert_eq ! ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF , perr( r"\b{" ) ) ;
2068+ assert_eq ! ( ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF , perr( r"(?x)\b{ " ) ) ;
19862069 }
19872070
19882071 #[ test]
0 commit comments