Fix Lexer greediness
Some edge cases around numbers were not handled as expected. This commit
adds test cases from the two RFCs clarifying the expected behaviour
(graphql/graphql-spec#601, graphql/graphql-spec#599) and updates the
Lexer to match.

This is technically a breaking change, but most affected cases were
likely to lead to validation errors anyway (e.g. "0xF1" being parsed as
[0, xF1] when expecting a list of integers).
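
As a rough sketch of the behaviour change, assuming the `Lexer` interface exercised by the updated tests below (the `py_gql.lang.lexer` import path is inferred from the file layout):

```python
from py_gql.lang.lexer import Lexer  # assumed import path

lexer = Lexer("0xF1")
next(lexer)  # SOF token
# Previously the lexer stopped at the first non-digit and yielded
# Integer "0" followed by Name "xF1". With this commit it raises
# UnexpectedCharacter at position 1, since a name start character
# may not directly follow a number.
next(lexer)
```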
lirsacc committed Jan 28, 2020
1 parent bb72c22 commit 3f367d5
Showing 3 changed files with 77 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -47,6 +47,8 @@ Unreleased
- `SchemaVisitor.(on_field_definition|on_argument_definition|on_input_field_definition)` have become `SchemaVisitor.(on_field|on_argument|on_input_field)` to maintain consistency with other methods.
- Do not expose `GraphQLExtension` on `py_gql`.

- Fix Lexer greediness. Some edge cases were not handled as expected. This commit adds test cases from the 2 RFCs clarifying the expected behaviour ([graphql/graphql-spec#601](https://github.com/graphql/graphql-spec/pull/601), [graphql/graphql-spec#599](https://github.com/graphql/graphql-spec/pull/599)) and updates the Lexer to match. This is _technically_ a breaking change but most cases were likely to lead to validation errors (e.g. "0xF1" being parsed as [0, xF1] when expecting a list of integers).

[0.4.0](https://github.com/lirsacc/py-gql/releases/tag/0.4.0) - 2019-10-10
--------------------------------------------------------------------------

55 changes: 37 additions & 18 deletions py_gql/lang/lexer.py
@@ -15,7 +15,6 @@

from .._string_utils import ensure_unicode, parse_block_string
from ..exc import (
GraphQLSyntaxError,
InvalidCharacter,
InvalidEscapeSequence,
NonTerminatedString,
@@ -47,12 +46,14 @@
Token,
)

EOL_CHARS = frozenset([0x000A, 0x000D]) # "\n" # "\r"
EOL_CHARS = [0x000A, 0x000D] # "\n" # "\r"

IGNORED_CHARS = (
frozenset([0xFEFF, 0x0009, 0x0020, 0x002C])
| EOL_CHARS # BOM # \t # SPACE # ,
)
IGNORED_CHARS = [ # BOM # \t # SPACE # ,
0xFEFF,
0x0009,
0x0020,
0x002C,
] + EOL_CHARS

SYMBOLS = {
cls.value: cls
@@ -86,14 +87,27 @@


def _unexpected(
expected: str, char: str, position: int, source: str
) -> GraphQLSyntaxError:
char: Optional[str],
position: int,
source: str,
expected: Optional[str] = None,
) -> Union[UnexpectedEOF, UnexpectedCharacter]:
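# Build the appropriate syntax error: UnexpectedEOF when the input ran
# out, a targeted "expected X" message when the caller provided one,
# and a generic unexpected-character error otherwise.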
if char is None:
return UnexpectedEOF(position - 1, source)
else:
elif expected is not None:
return UnexpectedCharacter(
'Expected "%s" but found "%s"' % (expected, char), position, source
)
else:
return UnexpectedCharacter(
'Unexpected character "%s"' % char, position, source
)


def _is_name_start(code):
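# A name may start with "_" (0x5F), "A"-"Z" (0x41-0x5A) or "a"-"z" (0x61-0x7A).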
return (
code == 0x005F or 0x0041 <= code <= 0x005A or 0x0061 <= code <= 0x007A
)


class Lexer:
@@ -165,9 +179,7 @@ def _read_ellipsis(self) -> Ellip:
char = self._peek()
self._position += 1
if char != ".":
raise _unexpected(
".", cast(str, char), self._position, self._source
)
raise _unexpected(char, self._position, self._source, ".")
return Ellip(start, self._position)

def _read_string(self) -> String:
@@ -296,6 +308,17 @@ def _read_number(self) -> Union[Integer, Float]:

self._read_over_integer()

# Explicit lookahead restrictions.
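# A number may not be followed directly by a name start character, so
# inputs such as "0xF1", "1_234" or "123abc" now fail here instead of
# lexing as a number followed by a name.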
next_char = self._peek()
if next_char is not None:
next_code = ord(next_char)
if _is_name_start(next_code):
raise UnexpectedCharacter(
'Unexpected character "%s"' % next_char,
self._position,
self._source,
)

end = self._position
value = self._source[start:end]
return (
@@ -312,7 +335,7 @@ def _read_over_integer(self):
if code == 0x0030: # "0"
self._position += 1
char = self._peek()
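# A leading zero may only stand alone: any following digit ("00",
# "01", "01.23") is now rejected, not just a second zero.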
if char is not None and ord(char) == 0x0030:
if char is not None and (0x0030 <= ord(char) <= 0x0039):
raise UnexpectedCharacter(
'Unexpected character "%s"' % char,
self._position,
@@ -406,11 +429,7 @@ def __next__(self) -> Token:
return self._read_string()
elif code == 0x002D or 0x0030 <= code <= 0x0039:
return self._read_number()
elif (
code == 0x005F
or 0x0041 <= code <= 0x005A
or 0x0061 <= code <= 0x007A
):
elif _is_name_start(code):
return self._read_name()
else:
raise UnexpectedCharacter(
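Greediness also shows up with block strings: as the new test cases in the test file below assert, `""""""` must lex as a single empty block string rather than three empty strings. A minimal sketch, under the same assumed import path as above:

```python
from py_gql.lang.lexer import Lexer  # assumed import path

lexer = Lexer('""""""')
next(lexer)        # SOF
tok = next(lexer)  # a single BlockString(0, 6, ""), not three empty Strings
```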
39 changes: 38 additions & 1 deletion tests/test_lang/test_lexer.py
@@ -19,7 +19,9 @@
def lex_one(source):
lexer = Lexer(source)
assert type(next(lexer)) == token.SOF
return next(lexer)
val = next(lexer)
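# Asserting EOF guarantees the token above consumed the whole input.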
assert type(next(lexer)) == token.EOF
return val


def test_it_disallows_uncommon_control_characters():
@@ -101,6 +103,20 @@ def test_errors_respect_whitespace():
)


@pytest.mark.parametrize(
"value,expected",
[
("abc", token.Name(0, 3, "abc")),
("_abc", token.Name(0, 4, "_abc")),
("abc_", token.Name(0, 4, "abc_")),
("abc123", token.Name(0, 6, "abc123")),
("abc_123", token.Name(0, 7, "abc_123")),
],
)
def test_it_lexes_name(value, expected):
assert lex_one(value) == expected


@pytest.mark.parametrize(
"value,expected",
[
@@ -116,6 +132,7 @@ def test_errors_respect_whitespace():
'"unicode \\u1234\\u5678\\u90AB\\uCDEF"',
token.String(0, 34, "unicode \u1234\u5678\u90AB\uCDEF"),
),
('""', token.String(0, 2, "")),
],
)
def test_it_lexes_strings(value, expected):
@@ -126,6 +143,8 @@ def test_it_lexes_strings(value, expected):
"value, err_cls, expected_positon",
[
('"', NonTerminatedString, 1),
('"""', NonTerminatedString, 3),
('""""', NonTerminatedString, 4),
('"no end quote', NonTerminatedString, 13),
("'single quotes'", UnexpectedCharacter, 0),
('"contains unescaped \u0007 control char"', InvalidCharacter, 20),
@@ -154,6 +173,7 @@ def test_it_lex_reports_useful_string_errors(value, err_cls, expected_positon):
"value, expected",
[
('"""simple"""', token.BlockString(0, 12, "simple")),
('""""""', token.BlockString(0, 6, "")),
('""" white space """', token.BlockString(0, 19, " white space ")),
(
'"""contains " quote"""',
@@ -223,6 +243,7 @@ def test_it_lex_reports_useful_block_string_errors(
("123E4", token.Float(0, 5, "123E4")),
("123e-4", token.Float(0, 6, "123e-4")),
("123e+4", token.Float(0, 6, "123e+4")),
("1.2e3", token.Float(0, 5, "1.2e3")),
("-123e4", token.Float(0, 6, "-123e4")),
("-123E4", token.Float(0, 6, "-123E4")),
("-123e-4", token.Float(0, 7, "-123e-4")),
@@ -238,6 +259,7 @@ def test_it_lexes_numbers(string, expected):
"value, err_cls, expected_positon",
[
("00", UnexpectedCharacter, 1),
("01", UnexpectedCharacter, 1),
("+1", UnexpectedCharacter, 0),
("1.", UnexpectedEOF, 2),
("1.e1", UnexpectedCharacter, 2),
@@ -246,6 +268,21 @@ def test_it_lexes_numbers(string, expected):
("-A", UnexpectedCharacter, 1),
("1.0e", UnexpectedEOF, 4),
("1.0eA", UnexpectedCharacter, 4),
("123.", UnexpectedEOF, 4),
("123e", UnexpectedEOF, 4),
("123E", UnexpectedEOF, 4),
("01.23", UnexpectedCharacter, 1),
("1.2e3.4", UnexpectedCharacter, 7),
("1.23.4", UnexpectedCharacter, 6),
("1.2e3e", UnexpectedCharacter, 5),
("0xF1", UnexpectedCharacter, 1),
("0b10", UnexpectedCharacter, 1),
("123abc", UnexpectedCharacter, 3),
("1_234", UnexpectedCharacter, 1),
("1ß", UnexpectedCharacter, 1),
("1.23f", UnexpectedCharacter, 4),
("1.234_5", UnexpectedCharacter, 5),
("1.2ß", UnexpectedCharacter, 3),
],
)
def test_it_lex_reports_useful_number_errors(value, err_cls, expected_positon):
