Skip to content

Commit

Permalink
EscapeSequenceParsing
Browse files Browse the repository at this point in the history
  • Loading branch information
bhamiltoncx committed Feb 24, 2017
1 parent 219b88e commit 54220ed
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/

package org.antlr.v4.test.tool;

import org.antlr.v4.misc.EscapeSequenceParsing;
import org.antlr.v4.runtime.misc.IntervalSet;

import org.junit.Test;

import static org.antlr.v4.misc.EscapeSequenceParsing.Result;
import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link EscapeSequenceParsing#parseEscape(String, int)}.
 *
 * Every case parses from offset 0 and checks either that the input is
 * rejected ({@code Result.INVALID}) or that it yields the expected interval
 * set together with the number of chars consumed.
 */
public class TestEscapeSequenceParsing {
	/** First and last code points these tests expect for the {@code Deseret} property. */
	private static final int DESERET_FIRST = 0x10400;
	private static final int DESERET_LAST = 0x1044F;

	/** Runs the parser on {@code escape}, starting at its first char. */
	private static Result parseFromStart(String escape) {
		return EscapeSequenceParsing.parseEscape(escape, 0);
	}

	/** Asserts that {@code escape} is not accepted as an escape sequence. */
	private static void assertInvalid(String escape) {
		assertEquals(Result.INVALID, parseFromStart(escape));
	}

	/**
	 * Asserts that {@code escape} parses to {@code expectedSet} and that
	 * exactly {@code expectedLength} chars were consumed.
	 */
	private static void assertParsesTo(IntervalSet expectedSet, int expectedLength, String escape) {
		Result expected = new Result(Result.Type.INTERVAL_SET, expectedSet, expectedLength);
		assertEquals(expected, parseFromStart(escape));
	}

	// ---- degenerate input ----

	@Test
	public void testParseEmpty() {
		assertInvalid("");
	}

	@Test
	public void testParseJustBackslash() {
		assertInvalid("\\");
	}

	@Test
	public void testParseInvalidEscape() {
		assertInvalid("\\z");
	}

	// ---- single-character escapes ----

	@Test
	public void testParseNewline() {
		assertParsesTo(IntervalSet.of('\n'), 2, "\\n");
	}

	// ---- Unicode code point escapes (four-digit and brace forms) ----

	@Test
	public void testParseUnicodeTooShort() {
		assertInvalid("\\uABC");
	}

	@Test
	public void testParseUnicodeBMP() {
		assertParsesTo(IntervalSet.of(0xABCD), 6, "\\uABCD");
	}

	@Test
	public void testParseUnicodeSMPTooShort() {
		assertInvalid("\\u{}");
	}

	@Test
	public void testParseUnicodeSMPMissingCloseBrace() {
		assertInvalid("\\u{12345");
	}

	@Test
	public void testParseUnicodeTooBig() {
		// One past Character.MAX_CODE_POINT (0x10FFFF).
		assertInvalid("\\u{110000}");
	}

	@Test
	public void testParseUnicodeSMP() {
		assertParsesTo(IntervalSet.of(0x10ABCD), 10, "\\u{10ABCD}");
	}

	// ---- Unicode property escapes ----

	@Test
	public void testParseUnicodePropertyTooShort() {
		assertInvalid("\\p{}");
	}

	@Test
	public void testParseUnicodePropertyMissingCloseBrace() {
		assertInvalid("\\p{1234");
	}

	@Test
	public void testParseUnicodeProperty() {
		assertParsesTo(IntervalSet.of(DESERET_FIRST, DESERET_LAST), 11, "\\p{Deseret}");
	}

	// ---- inverted Unicode property escapes ----

	@Test
	public void testParseUnicodePropertyInvertedTooShort() {
		assertInvalid("\\P{}");
	}

	@Test
	public void testParseUnicodePropertyInvertedMissingCloseBrace() {
		assertInvalid("\\P{Deseret");
	}

	@Test
	public void testParseUnicodePropertyInverted() {
		// Everything below the Deseret range plus everything above it.
		IntervalSet everythingButDeseret = IntervalSet.of(0, DESERET_FIRST - 1);
		everythingButDeseret.add(DESERET_LAST + 1, Character.MAX_CODE_POINT);
		assertParsesTo(everythingButDeseret, 11, "\\P{Deseret}");
	}
}
2 changes: 1 addition & 1 deletion tool/src/org/antlr/v4/misc/CharSupport.java
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ public static int getCharValueFromCharInGrammarLiteral(String cstr) {
}
}

private static int parseHexValue(String cstr, int startOff, int endOff) {
public static int parseHexValue(String cstr, int startOff, int endOff) {
if (startOff < 0 || endOff < 0) {
return -1;
}
Expand Down
153 changes: 153 additions & 0 deletions tool/src/org/antlr/v4/misc/EscapeSequenceParsing.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/

package org.antlr.v4.misc;

import java.util.Objects;

import org.antlr.v4.runtime.Token;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.unicode.UnicodeData;

/**
 * Utility class to parse a single escape sequence such as:
 * <pre>
 *   \\n
 *   \\uABCD
 *   \\u{10ABCD}
 *   \\p{Foo}
 *   \\P{Bar}
 * </pre>
 * The backslashes are doubled in this comment on purpose: the Java compiler
 * processes Unicode escapes everywhere in a source file, including comments,
 * so a single backslash directly followed by {@code u} and a brace would not
 * compile. The parser itself expects a single backslash in its input.
 */
public abstract class EscapeSequenceParsing {
	/**
	 * Outcome of {@link #parseEscape(String, int)}: either the shared
	 * {@link #INVALID} sentinel, or the set of code points the escape denotes
	 * together with the number of chars of input it occupied.
	 */
	public static class Result {
		/** Discriminates a failed parse from a successful one. */
		public enum Type {
			INVALID,
			INTERVAL_SET
		}

		/**
		 * Shared sentinel returned for every input that is not a valid escape.
		 * Declared {@code final} so the sentinel cannot be reassigned by callers.
		 */
		public static final Result INVALID = new Result(Type.INVALID, IntervalSet.EMPTY_SET, -1);

		/** Whether the parse succeeded. */
		public final Type type;
		/** Code points denoted by the escape. */
		public final IntervalSet intervalSet;
		/** Number of chars consumed, counted from the start offset; -1 for {@link #INVALID}. */
		public final int parseLength;

		public Result(Type type, IntervalSet intervalSet, int parseLength) {
			this.type = type;
			this.intervalSet = intervalSet;
			this.parseLength = parseLength;
		}

		@Override
		public String toString() {
			return String.format(
				"%s type=%s intervalSet=%s parseLength=%d",
				super.toString(),
				type,
				intervalSet,
				parseLength);
		}

		/** Two results are equal when type, interval set and parse length all match. */
		@Override
		public boolean equals(Object other) {
			if (this == other) {
				return true;
			}
			if (!(other instanceof Result)) {
				return false;
			}
			Result that = (Result) other;
			// Enum and int fields are compared directly; no boxing needed.
			return this.type == that.type &&
				this.parseLength == that.parseLength &&
				Objects.equals(this.intervalSet, that.intervalSet);
		}

		@Override
		public int hashCode() {
			return Objects.hash(type, intervalSet, parseLength);
		}
	}

	/**
	 * Parses a single escape sequence starting at {@code startOff}.
	 *
	 * @param s the text containing the escape; must not be {@code null}
	 * @param startOff index in {@code s} of the backslash that introduces the
	 *     escape; expected to be non-negative
	 * @return {@link Result#INVALID} if no valid escape sequence was found,
	 *     otherwise a {@link Result} of type {@link Result.Type#INTERVAL_SET}
	 *     whose {@code parseLength} is the number of chars consumed
	 */
	public static Result parseEscape(String s, int startOff) {
		int offset = startOff;
		// Need at least the backslash plus one escaped char.
		if (offset + 2 > s.length() || s.codePointAt(offset) != '\\') {
			return Result.INVALID;
		}
		// Move past backslash
		offset++;
		int escaped = s.codePointAt(offset);
		// Move past escaped code point
		offset += Character.charCount(escaped);

		if (escaped == 'u') {
			return parseUnicodeEscape(s, startOff, offset);
		}
		if (escaped == 'p' || escaped == 'P') {
			return parsePropertyEscape(s, startOff, offset, escaped == 'P');
		}
		return parseSingleCharEscape(startOff, offset, escaped);
	}

	/**
	 * Parses the remainder of a Unicode code point escape; {@code offset}
	 * points just past the {@code u}. Accepts either exactly four hex digits
	 * or a brace-delimited hex number.
	 */
	private static Result parseUnicodeEscape(String s, int startOff, int offset) {
		// \\u{1} is the shortest we support
		if (offset + 3 > s.length()) {
			return Result.INVALID;
		}
		int hexStartOffset;
		int hexEndOffset;
		if (s.codePointAt(offset) == '{') {
			hexStartOffset = offset + 1;
			hexEndOffset = s.indexOf('}', hexStartOffset);
			if (hexEndOffset == -1) {
				return Result.INVALID;
			}
			// Consume the closing brace as well.
			offset = hexEndOffset + 1;
		} else {
			if (offset + 4 > s.length()) {
				return Result.INVALID;
			}
			hexStartOffset = offset;
			hexEndOffset = offset + 4;
			offset = hexEndOffset;
		}
		int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
		// No negative number is a code point, so reject all of them rather than
		// only the -1 failure sentinel.
		if (codePointValue < 0 || codePointValue > Character.MAX_CODE_POINT) {
			return Result.INVALID;
		}
		return new Result(
			Result.Type.INTERVAL_SET,
			IntervalSet.of(codePointValue),
			offset - startOff);
	}

	/**
	 * Parses the remainder of a Unicode property escape; {@code offset} points
	 * just past the {@code p} or {@code P}. When {@code inverted} is true the
	 * property's code points are complemented against
	 * {@code IntervalSet.COMPLETE_CHAR_SET}.
	 */
	private static Result parsePropertyEscape(String s, int startOff, int offset, boolean inverted) {
		// \p{L} is the shortest we support
		if (offset + 3 > s.length() || s.codePointAt(offset) != '{') {
			return Result.INVALID;
		}
		int openBraceOffset = offset;
		int closeBraceOffset = s.indexOf('}', openBraceOffset);
		if (closeBraceOffset == -1) {
			return Result.INVALID;
		}
		String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset);
		IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
		// A null lookup result is treated as an unknown property name.
		if (propertyIntervalSet == null) {
			return Result.INVALID;
		}
		if (inverted) {
			propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET);
		}
		return new Result(
			Result.Type.INTERVAL_SET,
			propertyIntervalSet,
			closeBraceOffset + 1 - startOff);
	}

	/**
	 * Resolves a one-character escape through
	 * {@code CharSupport.ANTLRLiteralEscapedCharValue}; {@code offset} points
	 * just past the escaped code point. A table entry of 0, or an escaped
	 * code point beyond the table, means the escape is not recognized.
	 */
	private static Result parseSingleCharEscape(int startOff, int offset, int escaped) {
		if (escaped >= CharSupport.ANTLRLiteralEscapedCharValue.length) {
			return Result.INVALID;
		}
		int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
		if (codePoint == 0) {
			return Result.INVALID;
		}
		return new Result(
			Result.Type.INTERVAL_SET,
			IntervalSet.of(codePoint),
			offset - startOff);
	}
}

0 comments on commit 54220ed

Please sign in to comment.