Skip to content

Ruby/Python: regex parser: group sequences of 'normal' characters #8166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.
7 changes: 6 additions & 1 deletion python/ql/lib/semmle/python/RegexTreeView.qll
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,12 @@ newtype TRegExpParent =
/** A special character */
TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
/** A normal character */
TRegExpNormalChar(Regex re, int start, int end) { re.normalCharacter(start, end) } or
TRegExpNormalChar(Regex re, int start, int end) {
re.normalCharacterSequence(start, end)
or
re.escapedCharacter(start, end) and
not re.specialCharacter(start, end, _)
} or
/** A back reference */
TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }

Expand Down
54 changes: 49 additions & 5 deletions python/ql/lib/semmle/python/regex.qll
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,7 @@ abstract class RegexString extends Expr {
}

predicate normalCharacter(int start, int end) {
end = start + 1 and
this.character(start, end) and
not this.specialCharacter(start, end, _)
}
Expand All @@ -446,6 +447,49 @@ abstract class RegexString extends Expr {
)
}

/**
* Holds if the range [start:end) consists of only 'normal' characters.
*/
predicate normalCharacterSequence(int start, int end) {
// a normal character inside a character set is interpreted on its own
this.normalCharacter(start, end) and
this.inCharSet(start)
or
// a maximal run of normal characters is considered as one constant
exists(int s, int e |
e = max(int i | this.normalCharacterRun(s, i)) and
not this.inCharSet(s)
|
// 'abc' can be considered one constant, but
// 'abc+' has to be broken up into 'ab' and 'c+',
// as the qualifier only applies to 'c'.
if this.qualifier(e, _, _, _)
then
end = e and start = e - 1
or
end = e - 1 and start = s and start < end
else (
end = e and
start = s
)
)
}

private predicate normalCharacterRun(int start, int end) {
(
this.normalCharacterRun(start, end - 1)
or
start = end - 1 and not this.normalCharacter(start - 1, start)
) and
this.normalCharacter(end - 1, end)
}

private predicate characterItem(int start, int end) {
this.normalCharacterSequence(start, end) or
this.escapedCharacter(start, end) or
this.specialCharacter(start, end, _)
}

/** Whether the text in the range start,end is a group */
predicate group(int start, int end) {
this.groupContents(start, end, _, _)
Expand Down Expand Up @@ -717,7 +761,7 @@ abstract class RegexString extends Expr {
string getBackrefName(int start, int end) { this.named_backreference(start, end, result) }

private predicate baseItem(int start, int end) {
this.character(start, end) and
this.characterItem(start, end) and
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
or
this.group(start, end)
Expand Down Expand Up @@ -837,14 +881,14 @@ abstract class RegexString extends Expr {
}

private predicate item_start(int start) {
this.character(start, _) or
this.characterItem(start, _) or
this.isGroupStart(start) or
this.charSet(start, _) or
this.backreference(start, _)
}

private predicate item_end(int end) {
this.character(_, end)
this.characterItem(_, end)
or
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
or
Expand Down Expand Up @@ -953,7 +997,7 @@ abstract class RegexString extends Expr {
*/
predicate firstItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or
Expand All @@ -968,7 +1012,7 @@ abstract class RegexString extends Expr {
*/
predicate lastItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or
Expand Down
28 changes: 14 additions & 14 deletions python/ql/test/library-tests/regex/FirstLast.expected
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
| 012345678 | first | 0 | 1 |
| 012345678 | last | 8 | 9 |
| (?!not-this)^[A-Z_]+$ | first | 3 | 4 |
| 012345678 | first | 0 | 9 |
| 012345678 | last | 0 | 9 |
| (?!not-this)^[A-Z_]+$ | first | 3 | 11 |
| (?!not-this)^[A-Z_]+$ | first | 12 | 13 |
| (?!not-this)^[A-Z_]+$ | first | 13 | 19 |
| (?!not-this)^[A-Z_]+$ | first | 13 | 20 |
Expand All @@ -27,9 +27,9 @@
| (?m)^(?!$) | last | 4 | 5 |
| (?m)^(?!$) | last | 8 | 9 |
| (\\033\|~{) | first | 1 | 5 |
| (\\033\|~{) | first | 6 | 7 |
| (\\033\|~{) | first | 6 | 8 |
| (\\033\|~{) | last | 1 | 5 |
| (\\033\|~{) | last | 7 | 8 |
| (\\033\|~{) | last | 6 | 8 |
| [\ufffd-\ufffd] | first | 0 | 5 |
| [\ufffd-\ufffd] | last | 0 | 5 |
| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 |
Expand All @@ -52,8 +52,8 @@
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |
| \\Afoo\\Z | first | 0 | 2 |
| \\Afoo\\Z | first | 2 | 3 |
| \\Afoo\\Z | last | 4 | 5 |
| \\Afoo\\Z | first | 2 | 5 |
| \\Afoo\\Z | last | 2 | 5 |
| \\Afoo\\Z | last | 5 | 7 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
Expand Down Expand Up @@ -86,30 +86,30 @@
| ^[A-Z_]+$(?<!not-this) | last | 1 | 7 |
| ^[A-Z_]+$(?<!not-this) | last | 1 | 8 |
| ^[A-Z_]+$(?<!not-this) | last | 8 | 9 |
| ^[A-Z_]+$(?<!not-this) | last | 20 | 21 |
| ^[A-Z_]+$(?<!not-this) | last | 13 | 21 |
| ax{01,3} | first | 0 | 1 |
| ax{01,3} | last | 1 | 2 |
| ax{01,3} | last | 1 | 8 |
| ax{01,3} | last | 7 | 8 |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This (and others like it) looks like an existing bug. I think it is due to this line in simpleCharacter not considering qualifiers longer than one character (such as the multiples notation {n, m}).

| ax{01,3} | last | 3 | 8 |
| ax{3,} | first | 0 | 1 |
| ax{3,} | last | 1 | 2 |
| ax{3,} | last | 1 | 6 |
| ax{3,} | last | 5 | 6 |
| ax{3,} | last | 3 | 6 |
| ax{3} | first | 0 | 1 |
| ax{3} | last | 1 | 2 |
| ax{3} | last | 1 | 5 |
| ax{3} | last | 4 | 5 |
| ax{3} | last | 3 | 5 |
| ax{,3} | first | 0 | 1 |
| ax{,3} | last | 0 | 1 |
| ax{,3} | last | 1 | 2 |
| ax{,3} | last | 1 | 6 |
| ax{,3} | last | 5 | 6 |
| ax{,3} | last | 3 | 6 |
| x\| | first | 0 | 1 |
| x\| | last | 0 | 1 |
| x\|(?<!\\w)l | first | 0 | 1 |
| x\|(?<!\\w)l | first | 6 | 8 |
| x\|(?<!\\w)l | first | 9 | 10 |
| x\|(?<!\\w)l | last | 0 | 1 |
| x\|(?<!\\w)l | last | 9 | 10 |
| x{Not qual} | first | 0 | 1 |
| x{Not qual} | last | 10 | 11 |
| x{Not qual} | first | 0 | 11 |
| x{Not qual} | last | 0 | 11 |
4 changes: 4 additions & 0 deletions python/ql/test/library-tests/regex/Regex.ql
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ predicate part(Regex r, int start, int end, string kind) {
or
r.normalCharacter(start, end) and kind = "char"
or
r.escapedCharacter(start, end) and
kind = "char" and
not r.specialCharacter(start, end, _)
or
r.specialCharacter(start, end, kind)
or
r.sequence(start, end) and kind = "sequence"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
| redos.py:220:25:220:29 | [^X]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'W'. |
| redos.py:223:30:223:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
| redos.py:229:30:229:30 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
| redos.py:241:27:241:27 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ba'. |
| redos.py:241:26:241:27 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
| redos.py:247:25:247:31 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| redos.py:256:25:256:27 | \\w* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
| redos.py:256:37:256:39 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbaz' and containing many repetitions of 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* The regular expression parser now groups sequences of normal characters. This reduces the number of instances of `RegExpNormalChar`.
54 changes: 49 additions & 5 deletions ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ class RegExp extends AST::RegExpLiteral {
}

predicate normalCharacter(int start, int end) {
end = start + 1 and
this.character(start, end) and
not this.specialCharacter(start, end, _)
}
Expand All @@ -401,6 +402,49 @@ class RegExp extends AST::RegExpLiteral {
)
}

/**
* Holds if the range [start:end) consists of only 'normal' characters.
*/
predicate normalCharacterSequence(int start, int end) {
// a normal character inside a character set is interpreted on its own
this.normalCharacter(start, end) and
this.inCharSet(start)
or
// a maximal run of normal characters is considered as one constant
exists(int s, int e |
e = max(int i | this.normalCharacterRun(s, i)) and
not this.inCharSet(s)
|
// 'abc' can be considered one constant, but
// 'abc+' has to be broken up into 'ab' and 'c+',
// as the qualifier only applies to 'c'.
if this.qualifier(e, _, _, _)
then
end = e and start = e - 1
or
end = e - 1 and start = s and start < end
else (
end = e and
start = s
)
)
}

private predicate normalCharacterRun(int start, int end) {
(
this.normalCharacterRun(start, end - 1)
or
start = end - 1 and not this.normalCharacter(start - 1, start)
) and
this.normalCharacter(end - 1, end)
}

private predicate characterItem(int start, int end) {
this.normalCharacterSequence(start, end) or
this.escapedCharacter(start, end) or
this.specialCharacter(start, end, _)
}

/** Whether the text in the range `start,end` is a group */
predicate group(int start, int end) {
this.groupContents(start, end, _, _)
Expand Down Expand Up @@ -639,7 +683,7 @@ class RegExp extends AST::RegExpLiteral {
string getBackRefName(int start, int end) { this.namedBackreference(start, end, result) }

private predicate baseItem(int start, int end) {
this.character(start, end) and
this.characterItem(start, end) and
not exists(int x, int y | this.charSet(x, y) and x <= start and y >= end)
or
this.group(start, end)
Expand Down Expand Up @@ -746,15 +790,15 @@ class RegExp extends AST::RegExpLiteral {
}

private predicate itemStart(int start) {
this.character(start, _) or
this.characterItem(start, _) or
this.isGroupStart(start) or
this.charSet(start, _) or
this.backreference(start, _) or
this.namedCharacterProperty(start, _, _)
}

private predicate itemEnd(int end) {
this.character(_, end)
this.characterItem(_, end)
or
exists(int endm1 | this.isGroupEnd(endm1) and end = endm1 + 1)
or
Expand Down Expand Up @@ -865,7 +909,7 @@ class RegExp extends AST::RegExpLiteral {
*/
predicate firstItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or
Expand All @@ -880,7 +924,7 @@ class RegExp extends AST::RegExpLiteral {
*/
predicate lastItem(int start, int end) {
(
this.character(start, end)
this.characterItem(start, end)
or
this.qualifiedItem(start, end, _, _)
or
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,12 @@ newtype TRegExpParent =
TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
TRegExpNormalChar(RegExp re, int start, int end) { re.normalCharacter(start, end) } or
TRegExpNormalChar(RegExp re, int start, int end) {
re.normalCharacterSequence(start, end)
or
re.escapedCharacter(start, end) and
not re.specialCharacter(start, end, _)
} or
TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or
TRegExpNamedCharacterProperty(RegExp re, int start, int end) {
re.namedCharacterProperty(start, end, _)
Expand Down
Loading