Handwritten parser for parsing requirements (#484)
Co-authored-by: Brett Cannon <brett@python.org>
Co-authored-by: Pradyun Gedam <pradyunsg@gmail.com>
3 people authored Jul 31, 2022
1 parent 2bd5da3 commit 2e5593c
Showing 8 changed files with 496 additions and 208 deletions.
228 changes: 228 additions & 0 deletions packaging/_parser.py
@@ -0,0 +1,228 @@
# The docstring for each parse function contains the grammar for the rule.
# The grammar uses a simple EBNF-inspired syntax:
#
# - Uppercase names are tokens
# - Lowercase names are rules (parsed with a parse_* function)
# - Parentheses are used for grouping
# - A | means either-or
# - A * means 0 or more
# - A + means 1 or more
# - A ? means 0 or 1
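#
# For example, the rule "version_many: OP VERSION (COMMA OP VERSION)*" below
# reads as: an OP token followed by a VERSION token, then zero or more
# comma-separated OP/VERSION pairs.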

from ast import literal_eval
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer


class Node:
def __init__(self, value: str) -> None:
self.value = value

def __str__(self) -> str:
return self.value

def __repr__(self) -> str:
return f"<{self.__class__.__name__}('{self}')>"

def serialize(self) -> str:
raise NotImplementedError


class Variable(Node):
def serialize(self) -> str:
return str(self)


class Value(Node):
def serialize(self) -> str:
return f'"{self}"'


class Op(Node):
def serialize(self) -> str:
return str(self)


MarkerVar = Union[Variable, Value]
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definitions
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]
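# For example, the marker 'os_name == "nt" and python_version < "3.9"'
# parses to [(Variable, Op, Value), "and", (Variable, Op, Value)].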


class Requirement(NamedTuple):
name: str
url: str
extras: List[str]
specifier: str
marker: str


def parse_named_requirement(requirement: str) -> Requirement:
"""
named_requirement:
IDENTIFIER extras (URL_SPEC | specifier) (SEMICOLON marker_expr)? END
"""
tokens = Tokenizer(requirement)
tokens.expect("IDENTIFIER", error_message="Expression must begin with package name")
name = tokens.read("IDENTIFIER").text
extras = parse_extras(tokens)
specifier = ""
url = ""
if tokens.match("URL_SPEC"):
url = tokens.read().text[1:].strip()
elif not tokens.match("END"):
specifier = parse_specifier(tokens)
if tokens.try_read("SEMICOLON"):
marker = ""
while not tokens.match("END"):
# we don't validate markers here, it's done later as part of
# packaging/requirements.py
marker += tokens.read().text
else:
marker = ""
    tokens.expect(
        "END",
error_message="Expected semicolon (followed by markers) or end of string",
)
return Requirement(name, url, extras, specifier, marker)


def parse_extras(tokens: Tokenizer) -> List[str]:
"""
extras: LBRACKET (IDENTIFIER (COMMA IDENTIFIER)*)? RBRACKET
"""
extras = []
if tokens.try_read("LBRACKET"):
while tokens.match("IDENTIFIER"):
extras.append(tokens.read("IDENTIFIER").text)
if not tokens.match("RBRACKET"):
tokens.read("COMMA", error_message="Missing comma after extra")
if not tokens.match("COMMA") and tokens.match("RBRACKET"):
break
tokens.read("RBRACKET", error_message="Closing square bracket is missing")
return extras


def parse_specifier(tokens: Tokenizer) -> str:
"""
specifier:
LPAREN version_many? RPAREN | version_many
"""
lparen = False
if tokens.try_read("LPAREN"):
lparen = True
parsed_specifiers = parse_version_many(tokens)
if lparen and not tokens.try_read("RPAREN"):
tokens.raise_syntax_error(message="Closing right parenthesis is missing")
return parsed_specifiers


def parse_version_many(tokens: Tokenizer) -> str:
"""
version_many: OP VERSION (COMMA OP VERSION)*
"""
parsed_specifiers = ""
while tokens.match("OP"):
parsed_specifiers += tokens.read("OP").text
if tokens.match("VERSION"):
parsed_specifiers += tokens.read("VERSION").text
else:
tokens.raise_syntax_error(message="Missing version")
if not tokens.match("COMMA"):
break
tokens.expect("COMMA", error_message="Missing comma after version")
parsed_specifiers += tokens.read("COMMA").text
return parsed_specifiers


def parse_marker_expr(tokens: Tokenizer) -> MarkerList:
"""
    marker_expr: marker_atom (BOOLOP marker_atom)*
"""
expression = [parse_marker_atom(tokens)]
while tokens.match("BOOLOP"):
tok = tokens.read("BOOLOP")
expr_right = parse_marker_atom(tokens)
expression.extend((tok.text, expr_right))
return expression


def parse_marker_atom(tokens: Tokenizer) -> MarkerAtom:
"""
marker_atom: LPAREN marker_expr RPAREN | marker_item
"""
if tokens.try_read("LPAREN"):
marker = parse_marker_expr(tokens)
tokens.read("RPAREN", error_message="Closing right parenthesis is missing")
return marker
else:
return parse_marker_item(tokens)


def parse_marker_item(tokens: Tokenizer) -> MarkerItem:
"""
marker_item: marker_var marker_op marker_var
"""
marker_var_left = parse_marker_var(tokens)
marker_op = parse_marker_op(tokens)
marker_var_right = parse_marker_var(tokens)
return (marker_var_left, marker_op, marker_var_right)


def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
"""
marker_var: env_var | python_str
"""
if tokens.match("VARIABLE"):
return parse_env_var(tokens)
else:
return parse_python_str(tokens)


def parse_env_var(tokens: Tokenizer) -> Variable:
"""
env_var: VARIABLE
"""
env_var = tokens.read("VARIABLE").text.replace(".", "_")
if (
env_var == "platform_python_implementation"
or env_var == "python_implementation"
):
return Variable("platform_python_implementation")
else:
return Variable(env_var)


def parse_python_str(tokens: Tokenizer) -> Value:
"""
python_str: QUOTED_STRING
"""
token = tokens.read(
"QUOTED_STRING",
error_message="String with single or double quote at the beginning is expected",
).text
python_str = literal_eval(token)
return Value(str(python_str))


def parse_marker_op(tokens: Tokenizer) -> Op:
"""
marker_op: IN | NOT IN | OP
"""
if tokens.try_read("IN"):
return Op("in")
elif tokens.try_read("NOT"):
tokens.read("IN", error_message="NOT token must be follewed by IN token")
return Op("not in")
elif tokens.match("OP"):
return Op(tokens.read().text)
else:
        return tokens.raise_syntax_error(
            message=(
                "Couldn't parse marker operator. Expected one of "
                '"<=, <, !=, ==, >=, >, ~=, ===, in, not in"'
            )
        )
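
A minimal usage sketch (not part of this commit): parsing a requirement string
with the new parser. The input and the values shown in the comments are
inferred from the code above, so treat them as assumptions rather than
documented behavior.

from packaging._parser import parse_named_requirement

req = parse_named_requirement(
    "requests[security,tests]>=2.8.1,==2.8.*; python_version<'2.7'"
)
print(req.name)       # requests
print(req.extras)     # ['security', 'tests']
print(req.specifier)  # >=2.8.1,==2.8.*
print(req.marker)     # python_version<'2.7' (raw token texts, whitespace stripped)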
164 changes: 164 additions & 0 deletions packaging/_tokenizer.py
@@ -0,0 +1,164 @@
import re
from typing import Dict, Generator, NoReturn, Optional

from .specifiers import Specifier


class Token:
def __init__(self, name: str, text: str, position: int) -> None:
self.name = name
self.text = text
self.position = position

def matches(self, name: str = "") -> bool:
if name and self.name != name:
return False
return True


class ParseExceptionError(Exception):
"""
Parsing failed.
"""

def __init__(self, message: str, position: int) -> None:
super().__init__(message)
self.position = position


DEFAULT_RULES = {
"LPAREN": r"\s*\(",
"RPAREN": r"\s*\)",
"LBRACKET": r"\s*\[",
"RBRACKET": r"\s*\]",
"SEMICOLON": r"\s*;",
"COMMA": r"\s*,",
"QUOTED_STRING": re.compile(
r"""
\s*
(
('[^']*')
|
("[^"]*")
)
""",
re.VERBOSE,
),
"OP": r"\s*(===|==|~=|!=|<=|>=|<|>)",
"BOOLOP": r"\s*(or|and)",
"IN": r"\s*in",
"NOT": r"\s*not",
"VARIABLE": re.compile(
r"""
\s*
(
python_version
|python_full_version
|os[._]name
|sys[._]platform
|platform_(release|system)
|platform[._](version|machine|python_implementation)
|python_implementation
|implementation_(name|version)
|extra
)
""",
re.VERBOSE,
),
"VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
"URL_SPEC": r"\s*@ *[^ ]+",
"IDENTIFIER": r"\s*[a-zA-Z0-9._-]+",
}
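
# For example, ">=2.8.1" should tokenize as OP(">=") followed by
# VERSION("2.8.1"); the exact version forms accepted are delegated to
# Specifier._version_regex_str.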


class Tokenizer:
"""Stream of tokens for a LL(1) parser.
Provides methods to examine the next token to be read, and to read it
(advance to the next token).
"""

next_token: Optional[Token]

def __init__(self, source: str, rules: Dict[str, object] = DEFAULT_RULES) -> None:
self.source = source
self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
self.next_token = None
self.generator = self._tokenize()
self.position = 0

def peek(self) -> Token:
"""
Return the next token to be read.
"""
if not self.next_token:
self.next_token = next(self.generator)
return self.next_token

def match(self, *name: str) -> bool:
"""
Return True if the next token matches the given arguments.
"""
token = self.peek()
return token.matches(*name)

def expect(self, *name: str, error_message: str) -> Token:
"""
Raise SyntaxError if the next token doesn't match given arguments.
"""
token = self.peek()
if not token.matches(*name):
raise self.raise_syntax_error(message=error_message)
return token

def read(self, *name: str, error_message: str = "") -> Token:
"""Return the next token and advance to the next token.
Raise SyntaxError if the token doesn't match.
"""
result = self.expect(*name, error_message=error_message)
self.next_token = None
return result

def try_read(self, *name: str) -> Optional[Token]:
"""read() if the next token matches the given arguments.
Do nothing if it does not match.
"""
if self.match(*name):
return self.read()
return None

def raise_syntax_error(self, *, message: str) -> NoReturn:
"""
Raise SyntaxError at the given position in the marker.
"""
at = f"at position {self.position}:"
marker = " " * self.position + "^"
raise ParseExceptionError(
f"{message}\n{at}\n {self.source}\n {marker}",
self.position,
)

def _make_token(self, name: str, text: str) -> Token:
"""
Make a token with the current position.
"""
return Token(name, text, self.position)

def _tokenize(self) -> Generator[Token, Token, None]:
"""
The main generator of tokens.
"""
while self.position < len(self.source):
for name, expression in self.rules.items():
match = expression.match(self.source, self.position)
if match:
token_text = match[0]

yield self._make_token(name, token_text.strip())
self.position += len(token_text)
break
else:
raise self.raise_syntax_error(message="Unrecognized token")
yield self._make_token("END", "")