Skip to content

Commit

Permalink
Add RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS warning, refactor …
Browse files Browse the repository at this point in the history
…code related to caseInsensitive option

fixes #3433
  • Loading branch information
KvanTTT committed Dec 25, 2021
1 parent 8bf1039 commit 7fa59a4
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,42 +23,22 @@ public abstract class CodePointTransitions {
* If {@code codePoint} is <= U+FFFF, returns a new {@link AtomTransition}.
* Otherwise, returns a new {@link SetTransition}.
*/
public static Transition createWithCodePoint(ATNState target, int codePoint, boolean caseInsensitive) {
return createWithCodePointRange(target, codePoint, codePoint, caseInsensitive);
public static Transition createWithCodePoint(ATNState target, int codePoint) {
return createWithCodePointRange(target, codePoint, codePoint);
}

/**
* If {@code codePointFrom} and {@code codePointTo} are both
* <= U+FFFF, returns a new {@link RangeTransition}.
* Otherwise, returns a new {@link SetTransition}.
*/
public static Transition createWithCodePointRange(
ATNState target,
int codePointFrom,
int codePointTo,
boolean caseInsensitive) {
if (caseInsensitive) {
int lowerCodePointFrom = Character.toLowerCase(codePointFrom);
int upperCodePointFrom = Character.toUpperCase(codePointFrom);
int lowerCodePointTo = Character.toLowerCase(codePointTo);
int upperCodePointTo = Character.toUpperCase(codePointTo);
if (lowerCodePointFrom == upperCodePointFrom && lowerCodePointTo == upperCodePointTo) {
return createWithCodePointRange(target, lowerCodePointFrom, lowerCodePointTo, false);
} else {
IntervalSet intervalSet = new IntervalSet();
intervalSet.add(lowerCodePointFrom, lowerCodePointTo);
intervalSet.add(upperCodePointFrom, upperCodePointTo);
return new SetTransition(target, intervalSet);
}
public static Transition createWithCodePointRange(ATNState target, int codePointFrom, int codePointTo) {
if (Character.isSupplementaryCodePoint(codePointFrom) || Character.isSupplementaryCodePoint(codePointTo)) {
return new SetTransition(target, IntervalSet.of(codePointFrom, codePointTo));
} else {
if (Character.isSupplementaryCodePoint(codePointFrom) ||
Character.isSupplementaryCodePoint(codePointTo)) {
return new SetTransition(target, IntervalSet.of(codePointFrom, codePointTo));
} else {
return codePointFrom == codePointTo
? new AtomTransition(target, codePointFrom)
: new RangeTransition(target, codePointFrom, codePointTo);
}
return codePointFrom == codePointTo
? new AtomTransition(target, codePointFrom)
: new RangeTransition(target, codePointFrom, codePointTo);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -424,16 +424,26 @@ public void testSetUp() throws Exception {
checkLexerMatches(lg, inputString, expecting);
}

@Test public void testLexerCaseInsensitiveWithNegation() throws Exception {
@Test public void testLexerCaseInsensitiveLiteralWithNegation() {
String grammar =
"lexer grammar L;\n" +
"options { caseInsensitive = true; }\n" +
"TOKEN_WITH_NOT: ~'f';\n"; // ~('f' | 'F)
"LITERAL_WITH_NOT: ~'f';\n"; // ~('f' | 'F)
execLexer("L.g4", grammar, "L", "F");

assertEquals("line 1:0 token recognition error at: 'F'\n", getParseErrors());
}

@Test public void testLexerCaseInsensitiveSetWithNegation() {
String grammar =
"lexer grammar L;\n" +
"options { caseInsensitive = true; }\n" +
"SET_WITH_NOT: ~[a-c];\n"; // ~[a-cA-C]
execLexer("L.g4", grammar, "L", "B");

assertEquals("line 1:0 token recognition error at: 'B'\n", getParseErrors());
}

@Test public void testLexerCaseInsensitiveFragments() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n" +
Expand Down Expand Up @@ -483,6 +493,18 @@ public void testSetUp() throws Exception {
checkLexerMatches(lg, inputString, expecting);
}

@Test public void testNotImpliedCharactersWithEnabledCaseInsensitiveOption() throws Exception {
LexerGrammar lg = new LexerGrammar(
"lexer grammar L;\n" +
"options { caseInsensitive = true; }\n" +
"TOKEN: ('A'..'z')+;\n"
);

// No range transformation because of mixed character case in range borders
String inputString = "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz";
checkLexerMatches(lg, inputString, "TOKEN, EOF");
}

protected void checkLexerMatches(LexerGrammar lg, String inputString, String expecting) {
ATN atn = createATN(lg, true);
CharStream input = CharStreams.fromString(inputString);
Expand Down
25 changes: 21 additions & 4 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestSymbolIssues.java
Original file line number Diff line number Diff line change
Expand Up @@ -402,17 +402,20 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
testErrors(test, false);
}

@Test public void testCaseInsensitiveCharsCollision() throws Exception {
@Test public void testCaseInsensitiveCharsCollision() {
String[] test = {
"lexer grammar L;\n" +
"options { caseInsensitive = true; }\n" +
"TOKEN_RANGE: [a-fA-F0-9];\n" +
"TOKEN_RANGE_2: 'g'..'l' | 'G'..'L';\n",
"TOKEN_RANGE_2: 'g'..'l' | 'G'..'L';\n" +
"TOKEN_RANGE_3: 'm'..'q' | [M-Q];\n",

"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars a-f used multiple times in set [a-fA-F0-9]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:3:18: chars A-F used multiple times in set [a-fA-F0-9]\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars g-l used multiple times in set 'g'..'l' | 'G'..'L'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars G-L used multiple times in set 'g'..'l' | 'G'..'L'\n"
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4:4:13: chars G-L used multiple times in set 'g'..'l' | 'G'..'L'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars 'M' used multiple times in set 'M'..'Q' | 'm'..'q'\n" +
"warning(" + ErrorType.CHARACTERS_COLLISION_IN_SET.code + "): L.g4::: chars 'm' used multiple times in set 'M'..'Q' | 'm'..'q'\n"
};

testErrors(test, false);
Expand Down Expand Up @@ -453,7 +456,7 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {
testErrors(test, false);
}

@Test public void testIllegalModeOption() throws Exception {
@Test public void testIllegalModeOption() {
String[] test = {
"lexer grammar L;\n" +
"options { caseInsensitive = badValue; }\n" +
Expand All @@ -464,4 +467,18 @@ public void testLabelsForTokensWithMixedTypesLRWithoutLabels() {

testErrors(test, false);
}

@Test public void testNotImpliedCharacters() {
String[] test = {
"lexer grammar Test;\n" +
"TOKEN1: 'A'..'g';\n" +
"TOKEN2: [C-m];\n" +
"TOKEN3: [А-я]; // OK since range does not contain intermediate characters",

"warning(" + ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS.code + "): Test.g4:2:8: Range A..g probably contains not implied characters [\\]^_`. Both bounds should be defined in lower or UPPER case\n" +
"warning(" + ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS.code + "): Test.g4:3:8: Range C..m probably contains not implied characters [\\]^_`. Both bounds should be defined in lower or UPPER case\n"
};

testErrors(test, false);
}
}
6 changes: 3 additions & 3 deletions tool/src/org/antlr/v4/automata/ATNOptimizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ private static void optimizeSets(Grammar g, ATN atn) {
if (a != -1 && b != -1) {
for (int v = a; v <= b; v++) {
if (matchSet.contains(v)) {
// TODO: Token is missing (i.e. position in source will not be displayed).
// TODO: Token is missing (i.e. position in source is not displayed).
g.tool.errMgr.grammarError(ErrorType.CHARACTERS_COLLISION_IN_SET, g.fileName,
null,
CharSupport.getANTLRCharLiteralForChar(v),
Expand All @@ -124,11 +124,11 @@ private static void optimizeSets(Grammar g, ATN atn) {
Transition newTransition;
if (matchSet.getIntervals().size() == 1) {
if (matchSet.size() == 1) {
newTransition = CodePointTransitions.createWithCodePoint(blockEndState, matchSet.getMinElement(), false);
newTransition = CodePointTransitions.createWithCodePoint(blockEndState, matchSet.getMinElement());
}
else {
Interval matchInterval = matchSet.getIntervals().get(0);
newTransition = CodePointTransitions.createWithCodePointRange(blockEndState, matchInterval.a, matchInterval.b, false);
newTransition = CodePointTransitions.createWithCodePointRange(blockEndState, matchInterval.a, matchInterval.b);
}
}
else {
Expand Down
46 changes: 33 additions & 13 deletions tool/src/org/antlr/v4/automata/LexerATNFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import org.antlr.runtime.CommonToken;
import org.antlr.runtime.Token;
import org.antlr.runtime.tree.CommonTree;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.misc.EscapeSequenceParsing;
Expand Down Expand Up @@ -260,8 +261,9 @@ public Handle range(GrammarAST a, GrammarAST b) {
ATNState right = newState(b);
int t1 = CharSupport.getCharValueFromGrammarCharLiteral(a.getText());
int t2 = CharSupport.getCharValueFromGrammarCharLiteral(b.getText());
checkRange(a, b, t1, t2);
left.addTransition(CodePointTransitions.createWithCodePointRange(right, t1, t2, caseInsensitive));
if (checkRange(a, b, t1, t2)) {
left.addTransition(createTransition(right, t1, t2, a));
}
a.atnState = left;
b.atnState = left;
return new Handle(left, right);
Expand Down Expand Up @@ -305,7 +307,7 @@ else if ( t.getType()==ANTLRParser.TOKEN_REF ) {
Transition transition;
if (set.getIntervals().size() == 1) {
Interval interval = set.getIntervals().get(0);
transition = CodePointTransitions.createWithCodePointRange(right, interval.a, interval.b, caseInsensitive);
transition = CodePointTransitions.createWithCodePointRange(right, interval.a, interval.b);
}
else {
transition = new SetTransition(right, set);
Expand All @@ -329,13 +331,14 @@ protected boolean checkRange(GrammarAST leftNode, GrammarAST rightNode, int left
g.tool.errMgr.grammarError(ErrorType.INVALID_LITERAL_IN_LEXER_SET,
g.fileName, rightNode.getToken(), rightNode.getText());
}
if (!result) return result;
if (!result) return false;

if (rightValue < leftValue) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, leftNode.parent.getToken(), leftNode.getText() + ".." + rightNode.getText());
return false;
}
return result;
return true;
}

/** For a lexer, a string is a sequence of char to match. That is,
Expand All @@ -362,7 +365,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
for (int i = 0; i < n; ) {
right = newState(stringLiteralAST);
int codePoint = s.codePointAt(i);
prev.addTransition(CodePointTransitions.createWithCodePoint(right, codePoint, caseInsensitive));
prev.addTransition(createTransition(right, codePoint, codePoint, stringLiteralAST));
prev = right;
i += Character.charCount(codePoint);
}
Expand Down Expand Up @@ -568,16 +571,15 @@ private void checkCharAndAddToSet(GrammarAST ast, IntervalSet set, int c) {
}

private void checkRangeAndAddToSet(GrammarAST ast, IntervalSet set, int a, int b, boolean caseInsensitive) {
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(a, b, g, ast);
if (caseInsensitive) {
int lowerA = Character.toLowerCase(a);
int upperA = Character.toUpperCase(a);
int lowerB = Character.toLowerCase(b);
int upperB = Character.toUpperCase(b);
if (lowerA == upperA && upperB == lowerB) {
if (charactersData.lowerFrom == charactersData.upperFrom && charactersData.lowerTo == charactersData.upperTo ||
charactersData.mixOfLowerAndUpperCharCase
) {
checkRangeAndAddToSet(ast, set, a, b, false);
} else {
checkRangeAndAddToSet(ast, set, lowerA, lowerB, false);
checkRangeAndAddToSet(ast, set, upperA, upperB, false);
checkRangeAndAddToSet(ast, set, charactersData.lowerFrom, charactersData.lowerTo, false);
checkRangeAndAddToSet(ast, set, charactersData.upperFrom, charactersData.upperTo, false);
}
} else {
for (int i = a; i <= b; i++) {
Expand Down Expand Up @@ -610,6 +612,24 @@ private void checkRangeAndAddToSet(GrammarAST ast, IntervalSet set, int a, int b
}
}

private Transition createTransition(ATNState target, int from, int to, CommonTree tree) {
RangeBorderCharactersData charactersData = RangeBorderCharactersData.getAndCheckCharactersData(from, to, g, tree);
if (caseInsensitive) {
if (charactersData.lowerFrom == charactersData.upperFrom && charactersData.lowerTo == charactersData.upperTo ||
charactersData.mixOfLowerAndUpperCharCase
) {
return CodePointTransitions.createWithCodePointRange(target, from, to);
} else {
IntervalSet intervalSet = new IntervalSet();
intervalSet.add(charactersData.lowerFrom, charactersData.lowerTo);
intervalSet.add(charactersData.upperFrom, charactersData.upperTo);
return new SetTransition(target, intervalSet);
}
} else {
return CodePointTransitions.createWithCodePointRange(target, from, to);
}
}

@Override
public Handle tokenRef(TerminalAST node) {
// Ref to EOF in lexer yields char transition on -1
Expand Down
44 changes: 44 additions & 0 deletions tool/src/org/antlr/v4/automata/RangeBorderCharactersData.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package org.antlr.v4.automata;

import org.antlr.runtime.tree.CommonTree;
import org.antlr.v4.tool.ErrorType;
import org.antlr.v4.tool.Grammar;

public class RangeBorderCharactersData {
public int lowerFrom;
public int upperFrom;
public int lowerTo;
public int upperTo;
public boolean mixOfLowerAndUpperCharCase;

public RangeBorderCharactersData(int lowerFrom, int upperFrom, int lowerTo, int upperTo, boolean mixOfLowerAndUpperCharCase) {
this.lowerFrom = lowerFrom;
this.upperFrom = upperFrom;
this.lowerTo = lowerTo;
this.upperTo = upperTo;
this.mixOfLowerAndUpperCharCase = mixOfLowerAndUpperCharCase;
}

public static RangeBorderCharactersData getAndCheckCharactersData(int from, int to, Grammar grammar, CommonTree tree) {
int lowerFrom = Character.toLowerCase(from);
int upperFrom = Character.toUpperCase(from);
int lowerTo = Character.toLowerCase(to);
int upperTo = Character.toUpperCase(to);
boolean isLowerFrom = lowerFrom == from;
boolean isLowerTo = lowerTo == to;
boolean mixOfLowerAndUpperCharCase = isLowerFrom && !isLowerTo || !isLowerFrom && isLowerTo;
if (mixOfLowerAndUpperCharCase) {
StringBuilder notImpliedCharacters = new StringBuilder();
for (int i = from; i < to; i++) {
if (Character.toLowerCase(i) == Character.toUpperCase(i)) {
notImpliedCharacters.append((char)i);
}
}
if (notImpliedCharacters.length() > 0) {
grammar.tool.errMgr.grammarError(ErrorType.RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS, grammar.fileName, tree.getToken(),
(char) from, (char) to, notImpliedCharacters.toString());
}
}
return new RangeBorderCharactersData(lowerFrom, upperFrom, lowerTo, upperTo, mixOfLowerAndUpperCharCase);
}
}
19 changes: 19 additions & 0 deletions tool/src/org/antlr/v4/tool/ErrorType.java
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,25 @@ public enum ErrorType {
"One of the token <arg> values unreachable. <arg2> is always overlapped by token <arg3>",
ErrorSeverity.WARNING),

/**
* <p>Range probably contains not implied characters. Both bounds should be defined in lower or UPPER case
* For instance, the range [A-z] (ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxy)
* probably contains not implied characters: [\]^_`
*
* Use the following definition: [A-Za-z]
* If the characters are implied, include them explicitly: [A-Za-z[\\\]^_`]
* </p>
*
* <pre>
* TOKEN: [A-z]; // warning
* </pre>
*/
RANGE_PROBABLY_CONTAINS_NOT_IMPLIED_CHARACTERS(
185,
"Range <arg>..<arg2> probably contains not implied characters <arg3>. Both bounds should be defined in lower or UPPER case",
ErrorSeverity.WARNING
),

/*
* Backward incompatibility errors
*/
Expand Down

0 comments on commit 7fa59a4

Please sign in to comment.