github · aibaars · Feb 28, 2022 · Feb 22, 2022 · Feb 25, 2022 · Feb 25, 2022
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.
@@ -39,7 +39,12 @@ newtype TRegExpParent =
   /** A special character */
   TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
   /** A normal character */
-  TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
+  TRegExpNormalChar(Regex re, int start, int end) {
+    re.normalCharacterSequence(start, end)
+    or
+    re.escapedCharacter(start, end) and
+    not re.specialCharacter(start, end, _)
+  } or
   /** A back reference */
   TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
 

@@ -427,6 +427,7 @@ abstract class RegexString extends Expr {
   }
 
   predicate normalCharacter(int start, int end) {
+    end = start + 1 and
     this.character(start, end) and
     not this.specialCharacter(start, end, _)
   }
@@ -446,6 +447,49 @@ abstract class RegexString extends Expr {
     )
   }
 
+  /**
+   * Holds if the range [start:end) consists of only 'normal' characters.
+   */
+  predicate normalCharacterSequence(int start, int end) {
+    // a normal character inside a character set is interpreted on its own
+    this.normalCharacter(start, end) and
+    this.inCharSet(start)
+    or
+    // a maximal run of normal characters is considered as one constant
+    exists(int s, int e |
+      e = max(int i | this.normalCharacterRun(s, i)) and
+      not this.inCharSet(s)
+    |
+      // 'abc' can be considered one constant, but
+      // 'abc+' has to be broken up into 'ab' and 'c+',
+      // as the qualifier only applies to 'c'.
+      if this.qualifier(e, _, _, _)
+      then
+        end = e and start = e - 1
+        or
+        end = e - 1 and start = s and start < end
+      else (
+        end = e and
+        start = s
+      )
+    )
+  }
+
+  private predicate normalCharacterRun(int start, int end) {
+    (
+      this.normalCharacterRun(start, end - 1)
+      or
+      start = end - 1 and not this.normalCharacter(start - 1, start)
+    ) and
+    this.normalCharacter(end - 1, end)
+  }
+
+  private predicate characterItem(int start, int end) {
+    this.normalCharacterSequence(start, end) or
+    this.escapedCharacter(start, end) or
+    this.specialCharacter(start, end, _)
+  }
+
   /** Whether the text in the range start,end is a group */
   predicate group(int start, int end) {
     this.groupContents(start, end, _, _)
@@ -717,7 +761,7 @@ abstract class RegexString extends Expr {
   string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }
 
   private predicate baseItem(int start, int end) {
-    this.character(start, end) and
+    this.characterItem(start, end) and
     not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
     or
     this.group(start, end)
@@ -837,14 +881,14 @@ abstract class RegexString extends Expr {
   }
 
   private predicate item_start(int start) {
-    this.character(start, _) or
+    this.characterItem(start, _) or
     this.isGroupStart(start) or
     this.charSet(start, _) or
     this.backreference(start, _)
   }
 
   private predicate item_end(int end) {
-    this.character(_, end)
+    this.characterItem(_, end)
     or
     exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
     or
@@ -953,7 +997,7 @@ abstract class RegexString extends Expr {
    */
   predicate firstItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
@@ -968,7 +1012,7 @@ abstract class RegexString extends Expr {
    */
   predicate lastItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or

@@ -1,6 +1,6 @@
-| 012345678 | first | 0 | 1 |
-| 012345678 | last | 8 | 9 |
-| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
+| 012345678 | first | 0 | 9 |
+| 012345678 | last | 0 | 9 |
+| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
 | (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
 | (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
@@ -27,9 +27,9 @@
 | (?m)^(?!$) | last | 4 | 5 |
 | (?m)^(?!$) | last | 8 | 9 |
 | (\\033\|~{) | first | 1 | 5 |
-| (\\033\|~{) | first | 6 | 7 |
+| (\\033\|~{) | first | 6 | 8 |
 | (\\033\|~{) | last | 1 | 5 |
-| (\\033\|~{) | last | 7 | 8 |
+| (\\033\|~{) | last | 6 | 8 |
 | [\ufffd-\ufffd] | first | 0 | 5 |
 | [\ufffd-\ufffd] | last | 0 | 5 |
 | [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
@@ -52,8 +52,8 @@
 | \\A[+-]?\\d+ | last | 7 | 9 |
 | \\A[+-]?\\d+ | last | 7 | 10 |
 | \\Afoo\\Z | first | 0 | 2 |
-| \\Afoo\\Z | first | 2 | 3 |
-| \\Afoo\\Z | last | 4 | 5 |
+| \\Afoo\\Z | first | 2 | 5 |
+| \\Afoo\\Z | last | 2 | 5 |
 | \\Afoo\\Z | last | 5 | 7 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
 | \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
@@ -86,30 +86,30 @@
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
 | ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
 | ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
-| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
+| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
 | ax{01,3} | first | 0 | 1 |
 | ax{01,3} | last | 1 | 2 |
 | ax{01,3} | last | 1 | 8 |
-| ax{01,3} | last | 7 | 8 |
+| ax{01,3} | last | 3 | 8 |
 | ax{3,} | first | 0 | 1 |
 | ax{3,} | last | 1 | 2 |
 | ax{3,} | last | 1 | 6 |
-| ax{3,} | last | 5 | 6 |
+| ax{3,} | last | 3 | 6 |
 | ax{3} | first | 0 | 1 |
 | ax{3} | last | 1 | 2 |
 | ax{3} | last | 1 | 5 |
-| ax{3} | last | 4 | 5 |
+| ax{3} | last | 3 | 5 |
 | ax{,3} | first | 0 | 1 |
 | ax{,3} | last | 0 | 1 |
 | ax{,3} | last | 1 | 2 |
 | ax{,3} | last | 1 | 6 |
-| ax{,3} | last | 5 | 6 |
+| ax{,3} | last | 3 | 6 |
 | x\| | first | 0 | 1 |
 | x\| | last | 0 | 1 |
 | x\|(?<!\\w)l | first | 0 | 1 |
 | x\|(?<!\\w)l | first | 6 | 8 |
 | x\|(?<!\\w)l | first | 9 | 10 |
 | x\|(?<!\\w)l | last | 0 | 1 |
 | x\|(?<!\\w)l | last | 9 | 10 |
-| x{Not qual} | first | 0 | 1 |
-| x{Not qual} | last | 10 | 11 |
+| x{Not qual} | first | 0 | 11 |
+| x{Not qual} | last | 0 | 11 |
@@ -6,6 +6,10 @@ predicate part(Regex r, int start, int end, string kind) {
   or
   r.normalCharacter(start, end) and kind = "char"
   or
+  r.escapedCharacter(start, end) and
+  kind = "char" and
+  not r.specialCharacter(start, end, _)
+  or
   r.specialCharacter(start, end, kind)
   or
   r.sequence(start, end) and kind = "sequence"

@@ -59,7 +59,7 @@
 | redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
 | redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
 | redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
-| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
+| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
 | redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
 | redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
 | redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |

@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.
@@ -382,6 +382,7 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   predicate normalCharacter(int start, int end) {
+    end = start + 1 and
     this.character(start, end) and
     not this.specialCharacter(start, end, _)
   }
@@ -401,6 +402,49 @@ class RegExp extends AST::RegExpLiteral {
     )
   }
 
+  /**
+   * Holds if the range [start:end) consists of only 'normal' characters.
+   */
+  predicate normalCharacterSequence(int start, int end) {
+    // a normal character inside a character set is interpreted on its own
+    this.normalCharacter(start, end) and
+    this.inCharSet(start)
+    or
+    // a maximal run of normal characters is considered as one constant
+    exists(int s, int e |
+      e = max(int i | this.normalCharacterRun(s, i)) and
+      not this.inCharSet(s)
+    |
+      // 'abc' can be considered one constant, but
+      // 'abc+' has to be broken up into 'ab' and 'c+',
+      // as the qualifier only applies to 'c'.
+      if this.qualifier(e, _, _, _)
+      then
+        end = e and start = e - 1
+        or
+        end = e - 1 and start = s and start < end
+      else (
+        end = e and
+        start = s
+      )
+    )
+  }
+
+  private predicate normalCharacterRun(int start, int end) {
+    (
+      this.normalCharacterRun(start, end - 1)
+      or
+      start = end - 1 and not this.normalCharacter(start - 1, start)
+    ) and
+    this.normalCharacter(end - 1, end)
+  }
+
+  private predicate characterItem(int start, int end) {
+    this.normalCharacterSequence(start, end) or
+    this.escapedCharacter(start, end) or
+    this.specialCharacter(start, end, _)
+  }
+
   /** Whether the text in the range `start,end` is a group */
   predicate group(int start, int end) {
     this.groupContents(start, end, _, _)
@@ -639,7 +683,7 @@ class RegExp extends AST::RegExpLiteral {
   string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) }
 
   private predicate baseItem(int start, int end) {
-    this.character(start, end) and
+    this.characterItem(start, end) and
     not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
     or
     this.group(start, end)
@@ -746,15 +790,15 @@ class RegExp extends AST::RegExpLiteral {
   }
 
   private predicate itemStart(int start) {
-    this.character(start, _) or
+    this.characterItem(start, _) or
     this.isGroupStart(start) or
     this.charSet(start, _) or
     this.backreference(start, _) or
     this.namedCharacterProperty(start, _, _)
   }
 
   private predicate itemEnd(int end) {
-    this.character(_, end)
+    this.characterItem(_, end)
     or
     exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
     or
@@ -865,7 +909,7 @@ class RegExp extends AST::RegExpLiteral {
    */
   predicate firstItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or
@@ -880,7 +924,7 @@ class RegExp extends AST::RegExpLiteral {
    */
   predicate lastItem(int start, int end) {
     (
-      this.character(start, end)
+      this.characterItem(start, end)
       or
       this.qualifiedItem(start, end, _, _)
       or

@@ -228,7 +228,12 @@ newtype TRegExpParent =
   TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
   TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
   TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
-  TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or
+  TRegExpNormalChar(RegExp re, int start, int end) {
+    re.normalCharacterSequence(start, end)
+    or
+    re.escapedCharacter(start, end) and
+    not re.specialCharacter(start, end, _)
+  } or
   TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or
   TRegExpNamedCharacterProperty(RegExp re, int start, int end) {
     re.namedCharacterProperty(start, end, _)