Fix Lexer greediness
Some edge cases around numbers were not handled as expected. This commit
adds test cases from the two RFCs clarifying the expected behaviour
(graphql/graphql-spec#601, graphql/graphql-spec#599) and updates the
Lexer to match.

This is technically a breaking change, but most affected cases were
likely to lead to validation errors anyway (e.g. "0xF1" being parsed as
[0, xF1] when expecting a list of integers).
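
As a rough sketch of the behaviour change, assuming the `Lexer` interface exercised by the updated tests below (the `py_gql.lang.lexer` import path is inferred from the file layout):

```python
from py_gql.lang.lexer import Lexer  # assumed import path

lexer = Lexer("0xF1")
next(lexer)  # SOF token
# Previously the lexer stopped at the first non-digit and yielded
# Integer "0" followed by Name "xF1". With this commit it raises
# UnexpectedCharacter at position 1, since a name start character
# may not directly follow a number.
next(lexer)
```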
lirsacc committed Jan 28, 2020
1 parent bb72c22 commit 3f367d5
Showing 3 changed files with 77 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -47,6 +47,8 @@ Unreleased
- `SchemaVisitor.(on_field_definition|on_argument_definition|on_input_field_definition)` have become `SchemaVisitor.(on_field|on_argument|on_input_field)` to maintain consistency with other methods.
- Do not expose `GraphQLExtension` on `py_gql`.

- Fix Lexer greediness. Some edge cases were not handled as expected. This commit adds test cases from the 2 RFCs clarifying the expected behaviour ([graphql/graphql-spec#601](https://github.com/graphql/graphql-spec/pull/601), [graphql/graphql-spec#599](https://github.com/graphql/graphql-spec/pull/599)) and updates the Lexer to match. This is _technically_ a breaking change but most cases were likely to lead to validation errors (e.g. "0xF1" being parsed as [0, xF1] when expecting a list of integers).

[0.4.0](https://github.com/lirsacc/py-gql/releases/tag/0.4.0) - 2019-10-10
--------------------------------------------------------------------------

55 changes: 37 additions & 18 deletions py_gql/lang/lexer.py
@@ -15,7 +15,6 @@

from .._string_utils import ensure_unicode, parse_block_string
from ..exc import (
GraphQLSyntaxError,
InvalidCharacter,
InvalidEscapeSequence,
NonTerminatedString,
@@ -47,12 +46,14 @@
Token,
)

EOL_CHARS = frozenset([0x000A, 0x000D]) # "\n" # "\r"
EOL_CHARS = [0x000A, 0x000D] # "\n" # "\r"

IGNORED_CHARS = (
frozenset([0xFEFF, 0x0009, 0x0020, 0x002C])
| EOL_CHARS # BOM # \t # SPACE # ,
)
IGNORED_CHARS = [ # BOM # \t # SPACE # ,
0xFEFF,
0x0009,
0x0020,
0x002C,
] + EOL_CHARS

SYMBOLS = {
cls.value: cls
@@ -86,14 +87,27 @@


def _unexpected(
expected: str, char: str, position: int, source: str
) -> GraphQLSyntaxError:
char: Optional[str],
position: int,
source: str,
expected: Optional[str] = None,
) -> Union[UnexpectedEOF, UnexpectedCharacter]:
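# Build the appropriate syntax error: UnexpectedEOF when the input ran
# out, a targeted "expected X" message when the caller provided one,
# and a generic unexpected-character error otherwise.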
if char is None:
return UnexpectedEOF(position - 1, source)
else:
elif expected is not None:
return UnexpectedCharacter(
'Expected "%s" but found "%s"' % (expected, char), position, source
)
else:
return UnexpectedCharacter(
'Unexpected character "%s"' % char, position, source
)


def _is_name_start(code):
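# A name may start with "_" (0x5F), "A"-"Z" (0x41-0x5A) or "a"-"z" (0x61-0x7A).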
return (
code == 0x005F or 0x0041 <= code <= 0x005A or 0x0061 <= code <= 0x007A
)


class Lexer:
@@ -165,9 +179,7 @@ def _read_ellipsis(self) -> Ellip:
char = self._peek()
self._position += 1
if char != ".":
raise _unexpected(
".", cast(str, char), self._position, self._source
)
raise _unexpected(char, self._position, self._source, ".")
return Ellip(start, self._position)

def _read_string(self) -> String:
@@ -296,6 +308,17 @@ def _read_number(self) -> Union[Integer, Float]:

self._read_over_integer()

# Explicit lookahead restrictions.
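# A number may not be followed directly by a name start character, so
# inputs such as "0xF1", "1_234" or "123abc" now fail here instead of
# lexing as a number followed by a name.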
next_char = self._peek()
if next_char is not None:
next_code = ord(next_char)
if _is_name_start(next_code):
raise UnexpectedCharacter(
'Unexpected character "%s"' % next_char,
self._position,
self._source,
)

end = self._position
value = self._source[start:end]
return (
@@ -312,7 +335,7 @@ def _read_over_integer(self):
if code == 0x0030: # "0"
self._position += 1
char = self._peek()
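# A leading zero may only stand alone: any following digit ("00",
# "01", "01.23") is now rejected, not just a second zero.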
if char is not None and ord(char) == 0x0030:
if char is not None and (0x0030 <= ord(char) <= 0x0039):
raise UnexpectedCharacter(
'Unexpected character "%s"' % char,
self._position,
@@ -406,11 +429,7 @@ def __next__(self) -> Token:
return self._read_string()
elif code == 0x002D or 0x0030 <= code <= 0x0039:
return self._read_number()
elif (
code == 0x005F
or 0x0041 <= code <= 0x005A
or 0x0061 <= code <= 0x007A
):
elif _is_name_start(code):
return self._read_name()
else:
raise UnexpectedCharacter(
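Greediness also shows up with block strings: as the new test cases in the test file below assert, `""""""` must lex as a single empty block string rather than three empty strings. A minimal sketch, under the same assumed import path as above:

```python
from py_gql.lang.lexer import Lexer  # assumed import path

lexer = Lexer('""""""')
next(lexer)        # SOF
tok = next(lexer)  # a single BlockString(0, 6, ""), not three empty Strings
```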
39 changes: 38 additions & 1 deletion tests/test_lang/test_lexer.py
@@ -19,7 +19,9 @@
def lex_one(source):
lexer = Lexer(source)
assert type(next(lexer)) == token.SOF
return next(lexer)
val = next(lexer)
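# Asserting EOF guarantees the token above consumed the whole input.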
assert type(next(lexer)) == token.EOF
return val


def test_it_disallows_uncommon_control_characters():
@@ -101,6 +103,20 @@ def test_errors_respect_whitespace():
)


@pytest.mark.parametrize(
"value,expected",
[
("abc", token.Name(0, 3, "abc")),
("_abc", token.Name(0, 4, "_abc")),
("abc_", token.Name(0, 4, "abc_")),
("abc123", token.Name(0, 6, "abc123")),
("abc_123", token.Name(0, 7, "abc_123")),
],
)
def test_it_lexes_name(value, expected):
assert lex_one(value) == expected


@pytest.mark.parametrize(
"value,expected",
[
@@ -116,6 +132,7 @@ def test_errors_respect_whitespace():
'"unicode \\u1234\\u5678\\u90AB\\uCDEF"',
token.String(0, 34, "unicode \u1234\u5678\u90AB\uCDEF"),
),
('""', token.String(0, 2, "")),
],
)
def test_it_lexes_strings(value, expected):
@@ -126,6 +143,8 @@ def test_it_lexes_strings(value, expected):
"value, err_cls, expected_positon",
[
('"', NonTerminatedString, 1),
('"""', NonTerminatedString, 3),
('""""', NonTerminatedString, 4),
('"no end quote', NonTerminatedString, 13),
("'single quotes'", UnexpectedCharacter, 0),
('"contains unescaped \u0007 control char"', InvalidCharacter, 20),
@@ -154,6 +173,7 @@ def test_it_lex_reports_useful_string_errors(value, err_cls, expected_positon):
"value, expected",
[
('"""simple"""', token.BlockString(0, 12, "simple")),
('""""""', token.BlockString(0, 6, "")),
('""" white space """', token.BlockString(0, 19, " white space ")),
(
'"""contains " quote"""',
@@ -223,6 +243,7 @@ def test_it_lex_reports_useful_block_string_errors(
("123E4", token.Float(0, 5, "123E4")),
("123e-4", token.Float(0, 6, "123e-4")),
("123e+4", token.Float(0, 6, "123e+4")),
("1.2e3", token.Float(0, 5, "1.2e3")),
("-123e4", token.Float(0, 6, "-123e4")),
("-123E4", token.Float(0, 6, "-123E4")),
("-123e-4", token.Float(0, 7, "-123e-4")),
@@ -238,6 +259,7 @@ def test_it_lexes_numbers(string, expected):
"value, err_cls, expected_positon",
[
("00", UnexpectedCharacter, 1),
("01", UnexpectedCharacter, 1),
("+1", UnexpectedCharacter, 0),
("1.", UnexpectedEOF, 2),
("1.e1", UnexpectedCharacter, 2),
@@ -246,6 +268,21 @@ def test_it_lexes_numbers(string, expected):
("-A", UnexpectedCharacter, 1),
("1.0e", UnexpectedEOF, 4),
("1.0eA", UnexpectedCharacter, 4),
("123.", UnexpectedEOF, 4),
("123e", UnexpectedEOF, 4),
("123E", UnexpectedEOF, 4),
("01.23", UnexpectedCharacter, 1),
("1.2e3.4", UnexpectedCharacter, 7),
("1.23.4", UnexpectedCharacter, 6),
("1.2e3e", UnexpectedCharacter, 5),
("0xF1", UnexpectedCharacter, 1),
("0b10", UnexpectedCharacter, 1),
("123abc", UnexpectedCharacter, 3),
("1_234", UnexpectedCharacter, 1),
("1ß", UnexpectedCharacter, 1),
("1.23f", UnexpectedCharacter, 4),
("1.234_5", UnexpectedCharacter, 5),
("1.2ß", UnexpectedCharacter, 3),
],
)
def test_it_lex_reports_useful_number_errors(value, err_cls, expected_positon):
