Handwritten parser for parsing requirements (#484)
Co-authored-by: Brett Cannon <brett@python.org>
Co-authored-by: Pradyun Gedam <pradyunsg@gmail.com>
Commit 2e5593c, 1 parent 2bd5da3. Showing 8 changed files with 496 additions and 208 deletions.
@@ -0,0 +1,228 @@
# The docstring for each parse function contains the grammar for the rule.
# The grammar uses a simple EBNF-inspired syntax:
#
# - Uppercase names are tokens
# - Lowercase names are rules (parsed with a parse_* function)
# - Parentheses are used for grouping
# - A | means either-or
# - A * means 0 or more
# - A + means 1 or more
# - A ? means 0 or 1
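#
# For example, the rule "version_many: OP VERSION (COMMA OP VERSION)*" below
# reads as: one OP VERSION pair, followed by zero or more comma-separated
# OP VERSION pairs.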

from ast import literal_eval
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer


class Node:
    def __init__(self, value: str) -> None:
        self.value = value

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}('{self}')>"

    def serialize(self) -> str:
        raise NotImplementedError


class Variable(Node):
    def serialize(self) -> str:
        return str(self)


class Value(Node):
    def serialize(self) -> str:
        return f'"{self}"'


class Op(Node):
    def serialize(self) -> str:
        return str(self)


MarkerVar = Union[Variable, Value]
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definition
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]


class Requirement(NamedTuple):
    name: str
    url: str
    extras: List[str]
    specifier: str
    marker: str


def parse_named_requirement(requirement: str) -> Requirement:
    """
    named_requirement:
        IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END
    """
    tokens = Tokenizer(requirement)
    tokens.expect("IDENTIFIER", error_message="Expression must begin with package name")
    name = tokens.read("IDENTIFIER").text
    extras = parse_extras(tokens)
    specifier = ""
    url = ""
    if tokens.match("URL_SPEC"):
        url = tokens.read().text[1:].strip()
    elif not tokens.match("END"):
        specifier = parse_specifier(tokens)
    if tokens.try_read("SEMICOLON"):
        marker = ""
        while not tokens.match("END"):
            # we don't validate markers here, it's done later as part of
            # packaging/requirements.py
            marker += tokens.read().text
    else:
        marker = ""
        tokens.expect(
            "END",
            error_message="Expected semicolon (followed by markers) or end of string",
        )
    return Requirement(name, url, extras, specifier, marker)


def parse_extras(tokens: Tokenizer) -> List[str]:
    """
    extras: LBRACKET (IDENTIFIER (COMMA IDENTIFIER)*)? RBRACKET
    """
    extras = []
    if tokens.try_read("LBRACKET"):
        while tokens.match("IDENTIFIER"):
            extras.append(tokens.read("IDENTIFIER").text)
            if not tokens.match("RBRACKET"):
                tokens.read("COMMA", error_message="Missing comma after extra")
            if not tokens.match("COMMA") and tokens.match("RBRACKET"):
                break
        tokens.read("RBRACKET", error_message="Closing square bracket is missing")
    return extras
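
# For example, "[security,tests]" yields ["security", "tests"], and input
# with no opening bracket at the current position yields [].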


def parse_specifier(tokens: Tokenizer) -> str:
    """
    specifier:
        LPAREN version_many? RPAREN | version_many
    """
    lparen = False
    if tokens.try_read("LPAREN"):
        lparen = True
    parsed_specifiers = parse_version_many(tokens)
    if lparen and not tokens.try_read("RPAREN"):
        tokens.raise_syntax_error(message="Closing right parenthesis is missing")
    return parsed_specifiers


def parse_version_many(tokens: Tokenizer) -> str:
    """
    version_many: OP VERSION (COMMA OP VERSION)*
    """
    parsed_specifiers = ""
    while tokens.match("OP"):
        parsed_specifiers += tokens.read("OP").text
        if tokens.match("VERSION"):
            parsed_specifiers += tokens.read("VERSION").text
        else:
            tokens.raise_syntax_error(message="Missing version")
        if not tokens.match("COMMA"):
            break
        tokens.expect("COMMA", error_message="Missing comma after version")
        parsed_specifiers += tokens.read("COMMA").text
    return parsed_specifiers
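
# For example, ">=1.0,<2.0" is returned as the single string ">=1.0,<2.0";
# the OP, VERSION, and COMMA token texts are concatenated back together.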


def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
    """
    marker_expr: marker_atom (BOOLOP marker_atom)*
    """
    expression = [parse_marker_atom(tokens)]
    while tokens.match("BOOLOP"):
        tok = tokens.read("BOOLOP")
        expr_right = parse_marker_atom(tokens)
        expression.extend((tok.text, expr_right))
    return expression
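
# The result is a flat list alternating atoms and operator text, e.g.
# [<atom>, "and", <atom>, "or", <atom>]; parenthesized groups become
# nested lists via parse_marker_atom.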


def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
    """
    marker_atom: LPAREN marker_expr RPAREN | marker_item
    """
    if tokens.try_read("LPAREN"):
        marker = parse_marker_expr(tokens)
        tokens.read("RPAREN", error_message="Closing right parenthesis is missing")
        return marker
    else:
        return parse_marker_item(tokens)


def parse_marker_item(tokens: Tokenizer) -> MarkerItem:
    """
    marker_item: marker_var marker_op marker_var
    """
    marker_var_left = parse_marker_var(tokens)
    marker_op = parse_marker_op(tokens)
    marker_var_right = parse_marker_var(tokens)
    return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
    """
    marker_var: env_var | python_str
    """
    if tokens.match("VARIABLE"):
        return parse_env_var(tokens)
    else:
        return parse_python_str(tokens)


def parse_env_var(tokens: Tokenizer) -> Variable:
    """
    env_var: VARIABLE
    """
    env_var = tokens.read("VARIABLE").text.replace(".", "_")
    if (
        env_var == "platform_python_implementation"
        or env_var == "python_implementation"
    ):
        return Variable("platform_python_implementation")
    else:
        return Variable(env_var)
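
# For example, "os.name" is normalized to "os_name" above, and the alias
# "python_implementation" is mapped to "platform_python_implementation".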


def parse_python_str(tokens: Tokenizer) -> Value:
    """
    python_str: QUOTED_STRING
    """
    token = tokens.read(
        "QUOTED_STRING",
        error_message="String with single or double quote at the beginning is expected",
    ).text
    python_str = literal_eval(token)
    return Value(str(python_str))


def parse_marker_op(tokens: Tokenizer) -> Op:
    """
    marker_op: IN | NOT IN | OP
    """
    if tokens.try_read("IN"):
        return Op("in")
    elif tokens.try_read("NOT"):
tokens.read("IN", error_message="NOT token must be follewed by IN token") | ||
return Op("not in") | ||
elif tokens.match("OP"): | ||
return Op(tokens.read().text) | ||
else: | ||
return tokens.raise_syntax_error( | ||
message='Couldn\'t parse marker operator. Expecting one of \ | ||
"<=, <, !=, ==, >=, >, ~=, ===, not, not in"' | ||
) |
@@ -0,0 +1,164 @@
import re
from typing import Dict, Generator, NoReturn, Optional

from .specifiers import Specifier


class Token:
    def __init__(self, name: str, text: str, position: int) -> None:
        self.name = name
        self.text = text
        self.position = position

    def matches(self, name: str = "") -> bool:
        if name and self.name != name:
            return False
        return True


class ParseExceptionError(Exception):
    """
    Parsing failed.
    """

    def __init__(self, message: str, position: int) -> None:
        super().__init__(message)
        self.position = position


DEFAULT_RULES = {
    "LPAREN": r"\s*\(",
    "RPAREN": r"\s*\)",
    "LBRACKET": r"\s*\[",
    "RBRACKET": r"\s*\]",
    "SEMICOLON": r"\s*;",
    "COMMA": r"\s*,",
    "QUOTED_STRING": re.compile(
        r"""
            \s*
            (
                ('[^']*')
                |
                ("[^"]*")
            )
        """,
        re.VERBOSE,
    ),
    "OP": r"\s*(===|==|~=|!=|<=|>=|<|>)",
    "BOOLOP": r"\s*(or|and)",
    "IN": r"\s*in",
    "NOT": r"\s*not",
    "VARIABLE": re.compile(
        r"""
            \s*
            (
                python_version
                |python_full_version
                |os[._]name
                |sys[._]platform
                |platform_(release|system)
                |platform[._](version|machine|python_implementation)
                |python_implementation
                |implementation_(name|version)
                |extra
            )
        """,
        re.VERBOSE,
    ),
    "VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
    "URL_SPEC": r"\s*@ *[^ ]+",
    "IDENTIFIER": r"\s*[a-zA-Z0-9._-]+",
}
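
# Rules are tried in dict order at each position, and each pattern above
# (apart from VERSION, which reuses Specifier's version regex) begins with
# \s* so a token can absorb leading whitespace. For example,
# 'foo>=1.0; extra=="dev"' tokenizes roughly as:
#   IDENTIFIER "foo", OP ">=", VERSION "1.0", SEMICOLON ";",
#   VARIABLE "extra", OP "==", QUOTED_STRING '"dev"', then END.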


class Tokenizer:
    """Stream of tokens for an LL(1) parser.

    Provides methods to examine the next token to be read, and to read it
    (advance to the next token).
    """

    next_token: Optional[Token]

    def __init__(self, source: str, rules: Dict[str, object] = DEFAULT_RULES) -> None:
        self.source = source
        self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
        self.next_token = None
        self.generator = self._tokenize()
        self.position = 0

    def peek(self) -> Token:
        """
        Return the next token to be read.
        """
        if not self.next_token:
            self.next_token = next(self.generator)
        return self.next_token

    def match(self, *name: str) -> bool:
        """
        Return True if the next token matches the given arguments.
        """
        token = self.peek()
        return token.matches(*name)

    def expect(self, *name: str, error_message: str) -> Token:
        """
        Raise ParseExceptionError if the next token doesn't match given arguments.
        """
        token = self.peek()
        if not token.matches(*name):
            raise self.raise_syntax_error(message=error_message)
        return token

    def read(self, *name: str, error_message: str = "") -> Token:
        """Return the next token and advance to the next token.

        Raise ParseExceptionError if the token doesn't match.
        """
        result = self.expect(*name, error_message=error_message)
        self.next_token = None
        return result

    def try_read(self, *name: str) -> Optional[Token]:
        """read() if the next token matches the given arguments.

        Do nothing if it does not match.
        """
        if self.match(*name):
            return self.read()
        return None

    def raise_syntax_error(self, *, message: str) -> NoReturn:
        """
        Raise ParseExceptionError at the given position in the marker.
        """
        at = f"at position {self.position}:"
        marker = " " * self.position + "^"
        raise ParseExceptionError(
            f"{message}\n{at}\n {self.source}\n {marker}",
            self.position,
        )

    def _make_token(self, name: str, text: str) -> Token:
        """
        Make a token with the current position.
        """
        return Token(name, text, self.position)

    def _tokenize(self) -> Generator[Token, Token, None]:
        """
        The main generator of tokens.
        """
        while self.position < len(self.source):
            for name, expression in self.rules.items():
                match = expression.match(self.source, self.position)
                if match:
                    token_text = match[0]

                    yield self._make_token(name, token_text.strip())
                    self.position += len(token_text)
                    break
            else:
                raise self.raise_syntax_error(message="Unrecognized token")
        yield self._make_token("END", "")