Skip to content

Commit

Permalink
New \p{Foo} escape sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
bhamiltoncx committed Feb 24, 2017
1 parent 54220ed commit 44ddf86
Show file tree
Hide file tree
Showing 3 changed files with 230 additions and 29 deletions.
123 changes: 123 additions & 0 deletions tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,129 @@ public void testA() throws Exception {
"s4->RuleStop_A_2\n";
checkTokensRule(g, null, expecting);
}
/**
 * A bracketed set listing individual characters ({@code [abc]}) must compile
 * to a single set transition covering code points 97..99.
 */
@Test
public void testCharSet() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [abc] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{97..99}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A dash range inside a char set ({@code [a-c]}) must produce the same
 * 97..99 set transition as listing the characters one by one.
 */
@Test
public void testCharSetRange() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{97..99}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A four-hex-digit Unicode escape for code point ABCD inside a char set must
 * become a single-atom transition on 43981 (0xABCD).
 */
@Test
public void testCharSetUnicodeBMPEscape() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\uABCD] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-43981->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A plain range followed by a range whose endpoints are four-hex-digit
 * Unicode escapes must yield two intervals: 97..99 and 43981..44031.
 */
@Test
public void testCharSetUnicodeBMPEscapeRange() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c\\uABCD-\\uABFF] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{97..99, 43981..44031}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A braced Unicode escape for a code point above the BMP (10ABCD) must
 * become a single-atom transition on 1092557.
 */
@Test
public void testCharSetUnicodeSMPEscape() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\u{10ABCD}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-1092557->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A plain range followed by a range whose endpoints are braced Unicode
 * escapes above the BMP must yield 97..99 and 1092557..1092607.
 */
@Test
public void testCharSetUnicodeSMPEscapeRange() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{97..99, 1092557..1092607}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * A Unicode property escape ({@code \p{Gothic}}) inside a char set must
 * expand to the property's code points, expected here as 66352..66378.
 */
@Test
public void testCharSetUnicodePropertyEscape() throws Exception {
    // Gothic was picked because the script is long dead, so its code point
    // range is unlikely to change and break this expectation.
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{Gothic}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{66352..66378}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * The inverted property escape ({@code \P{Gothic}}) must expand to every
 * code point in 0..1114111 except the Gothic block 66352..66378.
 */
@Test
public void testCharSetUnicodePropertyInvertEscape() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\P{Gothic}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{0..66351, 66379..1114111}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * Two property escapes in one char set must be unioned: Gothic
 * (66352..66378) plus Mahajani (69968..70006).
 */
@Test
public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
    // Mahajani, like Gothic, is not expected to gain code points any time soon.
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{Gothic}\\p{Mahajani}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{66352..66378, 69968..70006}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
/**
 * Two property escapes naming ASCII_Hex_Digit and Hex_Digit in the same char
 * set must collapse into one set transition; the expected intervals list the
 * ASCII digits and letters once, followed by three further ranges.
 */
@Test
public void testCharSetUnicodePropertyOverlap() throws Exception {
    final String grammarText =
        "lexer grammar P;\n"+
        "A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
    final String expectedAtn =
        "s0->RuleStart_A_1\n" +
        "RuleStart_A_1->s3\n" +
        "s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
        "s4->RuleStop_A_2\n";
    checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
}
@Test public void testRangeOrRange() throws Exception {
LexerGrammar g = new LexerGrammar(
"lexer grammar P;\n"+
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,40 @@ public void testSetUp() throws Exception {
super.testErrors(pair, true);
}

/**
 * Feeds the tool a lexer grammar whose char sets contain malformed braced
 * Unicode escapes, malformed or unknown Unicode property escapes, and property
 * escapes used as range endpoints, then compares the reported errors with the
 * expected text.
 *
 * NOTE(review): the expected messages quote 'GH', 'LM', 'F'..'A' and [f-a],
 * none of which occur in the grammar below. They look carried over from a
 * different test; confirm them against the tool's real output.
 */
@Test
public void testInvalidUnicodeEscapesInCharSet() {
    final String grammarText =
        "lexer grammar Test;\n" +
        "INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
        "INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
        "INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
        "INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
        "INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
        "INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
        "INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
        "INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Foo}-\\p{Bar}];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Foo}-Z];\n" +
        "UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Foo}];\n" +
        "INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Foo}-\\P{Bar}];\n";

    final String expectedErrors =
        "error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" +
        "error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" +
        "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" +
        "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" +
        "error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" +
        "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" +
        "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" +
        "error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n";

    // testErrors takes the grammar and its expected diagnostics as a two-element pair.
    super.testErrors(new String[] { grammarText, expectedErrors }, true);
}

/**
* This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
* is produced as described in the documentation.
Expand Down
102 changes: 73 additions & 29 deletions tool/src/org/antlr/v4/automata/LexerATNFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.antlr.runtime.Token;
import org.antlr.v4.codegen.CodeGenerator;
import org.antlr.v4.misc.CharSupport;
import org.antlr.v4.misc.EscapeSequenceParsing;
import org.antlr.v4.parse.ANTLRParser;
import org.antlr.v4.runtime.IntStream;
import org.antlr.v4.runtime.Lexer;
Expand Down Expand Up @@ -365,7 +366,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
return new Handle(left, right);
}

/** [Aa\t \u1234a-z\]\-] char sets */
/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
@Override
public Handle charSetLiteral(GrammarAST charSetAST) {
ATNState left = newState(charSetAST);
Expand All @@ -379,51 +380,94 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
/**
 * Builds the {@link IntervalSet} of code points described by a lexer char set
 * literal such as {@code [a-z\p{Letter}]}.
 *
 * <p>The enclosing brackets are stripped from the token text before scanning.
 * An empty literal reports {@code EMPTY_STRINGS_AND_SETS_NOT_ALLOWED} and an
 * invalid escape reports {@code INVALID_ESCAPE_SEQUENCE}; both return an
 * empty set.
 *
 * <p>NOTE(review): this span is a rendered commit diff whose +/- markers were
 * lost, so lines of the removed and the added implementation are interleaved
 * below. Do not read it as one compilable body; confirm every detail against
 * the real file.
 *
 * @param charSetAST AST node whose text is the bracketed char set literal
 * @return the code points collected from the literal
 */
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
String chars = charSetAST.getText();
chars = chars.substring(1, chars.length() - 1);
String cset = '"' + chars + '"';
IntervalSet set = new IntervalSet();

if (chars.length() == 0) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), "[]");
return set;
}
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
// so we can avoid seeing them as '-' range ops.
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
if (chars == null) {
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken());
return set;
}

// prevCodePoint: a single code point that has been read but not yet added
// to the set (-1 means none pending). inRange: a '-' was seen after it, so
// the next code point closes the range.
int prevCodePoint = -1;
boolean inRange = false;
int n = chars.length();
// now make x-y become set of char
for (int i = 0; i < n; ) {
int c = chars.codePointAt(i);
int offset = Character.charCount(c);
if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
checkSetCollision(charSetAST, set, '-');
set.add('-');
offset++;
}
else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
int x = c;
int y = chars.codePointAt(i+offset+1);
if (x <= y) {
checkSetCollision(charSetAST, set, x, y);
set.add(x,y);
if (c == '\\') {
// Hand the escape starting at index i to the shared escape parser.
EscapeSequenceParsing.Result escapeParseResult =
EscapeSequenceParsing.parseEscape(chars, i);
switch (escapeParseResult.type) {
case INVALID:
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
g.fileName, charSetAST.getToken());
return new IntervalSet();
case INTERVAL_SET:
// A getSingleElement() result equal to Token.INVALID_TYPE is taken to mean
// the escape did not resolve to exactly one code point.
int codePoint = escapeParseResult.intervalSet.getSingleElement();
boolean containsMultipleCodePoints = (codePoint == org.antlr.v4.runtime.Token.INVALID_TYPE);
if (inRange) {
if (containsMultipleCodePoints) {
// XXX make a proper error
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), "[]");
} else if (prevCodePoint > codePoint) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
CharSupport.toRange(prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
} else {
checkSetCollision(charSetAST, set, prevCodePoint, codePoint);
set.add(prevCodePoint, codePoint);
}
inRange = false;
prevCodePoint = -1;
// NOTE(review): as rendered here, an escape seen with no pending code point
// and no open range matches neither branch — confirm against the real file.
} else if (prevCodePoint != -1) {
checkSetCollision(charSetAST, set, prevCodePoint);
set.add(prevCodePoint);

if (containsMultipleCodePoints) {
prevCodePoint = -1;
} else {
prevCodePoint = codePoint;
}

set.addAll(escapeParseResult.intervalSet);
}
break;
}
else {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
} else if (inRange) {
if (prevCodePoint > c) {
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
CharSupport.toRange(prevCodePoint, c, CharSupport.ToRangeMode.BRACKETED));
}
// NOTE(review): unlike the escape branch above, the range is still passed to
// checkSetCollision/add after the reversed-range error — confirm intended.
checkSetCollision(charSetAST, set, prevCodePoint, c);
set.add(prevCodePoint, c);
inRange = false;
prevCodePoint = -1;
} else if (prevCodePoint != -1) {
if (c == '-') {
inRange = true;
} else {
checkSetCollision(charSetAST, set, prevCodePoint);
set.add(prevCodePoint);
prevCodePoint = c;
}
offset += Character.charCount(y) + 1;
} else {
if (c == '-') {
// XXX make a proper error
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
g.fileName, charSetAST.getToken(), "[]");
} else {
prevCodePoint = c;
}
else {
checkSetCollision(charSetAST, set, c);
set.add(c);
}
i += offset;
}
// Whether or not we were in a range, we'll add the last code point found to the set.
// If the range wasn't terminated, we'll treat it as a standalone codepoint.
if (prevCodePoint != -1) {
checkSetCollision(charSetAST, set, prevCodePoint);
set.add(prevCodePoint);
prevCodePoint = -1;
}
return set;
}

Expand Down

0 comments on commit 44ddf86

Please sign in to comment.