documentation: use generic lexer infrastructure in Toy parser #3593

Merged: 4 commits merged on Dec 9, 2024
292 changes: 181 additions & 111 deletions docs/Toy/toy/frontend/lexer.py
@@ -1,126 +1,196 @@
import re
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path

from .location import Location


@dataclass(init=False)
class Token:
file: Path
line: int
col: int
text: str

def __init__(self, file: Path, line: int, col: int, text: str):
self.file = file
self.line = line
self.col = col
self.text = text

@property
def loc(self):
return Location(self.file, self.line, self.col)

@classmethod
def name(cls):
return cls.__name__


@dataclass
class IdentifierToken(Token):
pass


@dataclass
class NumberToken(Token):
value: float


@dataclass
class OperatorToken(Token):
pass


@dataclass
class SpecialToken(Token):
pass


@dataclass
class EOFToken(Token):
pass

from string import hexdigits
from typing import TypeAlias, cast

from xdsl.utils.exceptions import ParseError
from xdsl.utils.lexer import Input, Lexer, Position, Span, Token


class ToyTokenKind(Enum):
SEMICOLON = auto()
PARENTHESE_OPEN = auto()
PARENTHESE_CLOSE = auto()
BRACKET_OPEN = auto()
BRACKET_CLOSE = auto()
SBRACKET_OPEN = auto()
SBRACKET_CLOSE = auto()
LT = auto()
GT = auto()
EQ = auto()
COMMA = auto()

EOF = auto()

RETURN = auto()
VAR = auto()
DEF = auto()

IDENTIFIER = auto()
NUMBER = auto()
OPERATOR = auto()


SINGLE_CHAR_TOKENS = {
";": ToyTokenKind.SEMICOLON,
"(": ToyTokenKind.PARENTHESE_OPEN,
")": ToyTokenKind.PARENTHESE_CLOSE,
"{": ToyTokenKind.BRACKET_OPEN,
"}": ToyTokenKind.BRACKET_CLOSE,
"[": ToyTokenKind.SBRACKET_OPEN,
"]": ToyTokenKind.SBRACKET_CLOSE,
"<": ToyTokenKind.LT,
">": ToyTokenKind.GT,
"=": ToyTokenKind.EQ,
",": ToyTokenKind.COMMA,
"+": ToyTokenKind.OPERATOR,
"-": ToyTokenKind.OPERATOR,
"*": ToyTokenKind.OPERATOR,
"/": ToyTokenKind.OPERATOR,
}

IDENTIFIER_CHARS = re.compile(r"[\w]|[\d]|_")
OPERATOR_CHARS = set("+-*/")
SPECIAL_CHARS = set("<>}{(),;=[]")
SPECIAL_CHARS = set(",")


ToyToken: TypeAlias = Token[ToyTokenKind]


class ToyLexer(Lexer[ToyTokenKind]):
def _is_in_bounds(self, size: Position = 1) -> bool:
"""
Check if the current position is within the bounds of the input.
"""
return self.pos + size - 1 < self.input.len

def _get_chars(self, size: int = 1) -> str | None:
"""
Get the character at the current location, or multiple characters ahead.
Return None if the position is out of bounds.
"""
res = self.input.slice(self.pos, self.pos + size)
self.pos += size
return res

def _peek_chars(self, size: int = 1) -> str | None:
"""
Peek at the character at the current location, or multiple characters ahead.
Return None if the position is out of bounds.
"""
return self.input.slice(self.pos, self.pos + size)

def _consume_chars(self, size: int = 1) -> None:
"""
Advance the lexer position in the input by the given amount.
"""
self.pos += size

def _consume_regex(self, regex: re.Pattern[str]) -> re.Match[str] | None:
"""
Advance the lexer position to the end of the next match of the given
regular expression.
"""
match = regex.match(self.input.content, self.pos)
if match is None:
return None
self.pos = match.end()
return match

_whitespace_regex = re.compile(r"((#[^\n]*(\n)?)|(\s+))*", re.ASCII)

def _consume_whitespace(self) -> None:
"""
Consume whitespace and comments.
"""
self._consume_regex(self._whitespace_regex)

def lex(self) -> ToyToken:
# First, skip whitespaces
self._consume_whitespace()

start_pos = self.pos
current_char = self._get_chars()

# Handle end of file
if current_char is None:
return self._form_token(ToyTokenKind.EOF, start_pos)

# bare identifier
if current_char.isalpha() or current_char == "_":
return self._lex_bare_identifier(start_pos)

# single-char punctuation that is not part of a multi-char token
single_char_token_kind = SINGLE_CHAR_TOKENS.get(current_char)
if single_char_token_kind is not None:
return self._form_token(single_char_token_kind, start_pos)

if current_char.isnumeric():
return self._lex_number(start_pos)

raise ParseError(
Span(start_pos, start_pos + 1, self.input),
f"Unexpected character: {current_char}",
)

IDENTIFIER_SUFFIX = r"[a-zA-Z0-9_$.]*"
bare_identifier_suffix_regex = re.compile(IDENTIFIER_SUFFIX)

def _lex_bare_identifier(self, start_pos: Position) -> ToyToken:
"""
Lex a bare identifier with the following grammar:
`bare-id ::= (letter|[_]) (letter|digit|[_$.])*`

The first character is expected to have already been parsed.
"""
self._consume_regex(self.bare_identifier_suffix_regex)

return self._form_token(ToyTokenKind.IDENTIFIER, start_pos)

_hexdigits_star_regex = re.compile(r"[0-9a-fA-F]*")
_digits_star_regex = re.compile(r"[0-9]*")
_fractional_suffix_regex = re.compile(r"\.[0-9]*([eE][+-]?[0-9]+)?")

def _lex_number(self, start_pos: Position) -> ToyToken:
"""
Lex a number literal, which is either a decimal or a hexadecimal.
The first character is expected to have already been parsed.
"""
first_digit = self.input.at(self.pos - 1)

# Hexadecimal case, we only parse it if we see the first '0x' characters,
# and then a first digit.
# Otherwise, a string like '0xi32' would not be parsed correctly.
if (
first_digit == "0"
and self._peek_chars() == "x"
and self._is_in_bounds(2)
and cast(str, self.input.at(self.pos + 1)) in hexdigits
):
self._consume_chars(2)
self._consume_regex(self._hexdigits_star_regex)
return self._form_token(ToyTokenKind.NUMBER, start_pos)

# Decimal case
self._consume_regex(self._digits_star_regex)

# Check if we are lexing a floating point
match = self._consume_regex(self._fractional_suffix_regex)
if match is not None:
return self._form_token(ToyTokenKind.NUMBER, start_pos)
return self._form_token(ToyTokenKind.NUMBER, start_pos)


def tokenize(file: Path, program: str | None = None):
tokens: list[Token] = []

if program is None:
with open(file) as f:
program = f.read()

text = ""
row = col = 1

def flush():
nonlocal col, row, text
n = len(text)
if n == 0:
return

true_col = col - n

if text[0].isnumeric():
value = float(text)
tokens.append(NumberToken(file, row, true_col, text, value))
else:
tokens.append(IdentifierToken(file, row, true_col, text))

text = ""

for row, line in enumerate(program.splitlines()):
# 1-indexed
row += 1
for col, char in enumerate(line):
# 1-indexed
col += 1
if char == "#":
# Comment
break

if IDENTIFIER_CHARS.match(char):
text += char
continue

if char == ".":
# parse floating point
if not text or (text[0].isnumeric() and "." not in text):
# allow `.5` and `5.5` but not `5.5.5`
text += char
continue

flush()

if char == " ":
continue

if char in OPERATOR_CHARS:
tokens.append(OperatorToken(file, row, col, char))
continue
elif char in SPECIAL_CHARS:
tokens.append(SpecialToken(file, row, col, char))
continue

raise AssertionError(f"unhandled char {char} at ({row}, {col}) in \n{line}")
toy_lexer = ToyLexer(Input(program, str(file)))

col += 1
flush()
tokens = [toy_lexer.lex()]

tokens.append(EOFToken(file, row, col, ""))
while tokens[-1].kind != ToyTokenKind.EOF:
tokens.append(toy_lexer.lex())

return tokens
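
For reference (not part of the diff): a minimal sketch of how the rewritten tokenize() drives the generic lexer, assuming this lexer module is importable and using a made-up Toy snippet and file name. The kind and span fields come from xdsl.utils.lexer.Token, and span.text is assumed to expose the matched source text.

from xdsl.utils.lexer import Input

# ToyLexer and ToyTokenKind come from the lexer module above.
# Made-up example program; any valid Toy source works here.
program = "def main() { var a = [1, 2]; }"
lexer = ToyLexer(Input(program, "example.toy"))

# Pull tokens until the EOF sentinel, exactly as tokenize() does above.
tokens = [lexer.lex()]
while tokens[-1].kind != ToyTokenKind.EOF:
    tokens.append(lexer.lex())

for token in tokens:
    print(token.kind.name, repr(token.span.text))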
30 changes: 28 additions & 2 deletions docs/Toy/toy/frontend/location.py
@@ -1,14 +1,40 @@
import re
from dataclasses import dataclass
from pathlib import Path

from typing_extensions import Any

from xdsl.utils.lexer import Token


@dataclass
class Location:
"Structure definition a location in a file."

file: Path
file: str
line: int
col: int

def __repr__(self):
return f"{self.file}:{self.line}:{self.col}"


_NEWLINE = re.compile(r"\n")


def loc(token: Token[Any]) -> Location:
file = token.span.input.name
# Could be much faster

remaining = token.span.start
prev_end = 0

for line, newline_match in enumerate(
re.finditer(_NEWLINE, token.span.input.content)
):
len_line = newline_match.start() - prev_end
if remaining < len_line:
return Location(file, line + 1, remaining + 1)
remaining -= len_line + 1
prev_end = newline_match.end()

raise AssertionError(f"Could not find location of token {token}")
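
For reference (not part of the diff): a small worked example of loc(), assuming ToyLexer is importable from the lexer module above and Location and loc from this module; the source text and expected coordinates are made up for illustration.

from xdsl.utils.lexer import Input

# "var a\n" occupies offsets 0-5, so the identifier "b" starts at offset 10.
# The trailing newline matters: loc() locates tokens by walking newline matches.
source = "var a\nvar b\n"
lexer = ToyLexer(Input(source, "example.toy"))

token = lexer.lex()
while token.span.start < 10:  # skip ahead to the token starting at "b"
    token = lexer.lex()

# Line 1 plus its newline covers 6 characters, leaving offset 4 on line 2,
# which is 1-indexed column 5.
assert loc(token) == Location("example.toy", 2, 5)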