Merge pull request #70 from anachronauts/multiline-restream
Teach lexer to match token regexes over multiple lines
jdpage authored Sep 18, 2018
2 parents 032dea1 + 2791f1f commit 87e4892
Showing 6 changed files with 145 additions and 56 deletions.
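
In summary: the lexer previously pulled input a line at a time, so no token regex could match across a newline, and comments and strings needed special per-line rules. This commit buffers input in blocks inside ReStream and switches from the stdlib re module to the third-party regex module, whose partial-match support tells the lexer when a match might continue into unread input. A minimal standalone sketch of that capability (illustration only, not code from the commit):

    import regex

    pat = regex.compile(r'(?s).*?(?=\*/)')  # comment text up to '*/'
    m = pat.match('first line\nsecond li', partial=True)
    print(m.partial)  # True: '*/' not seen yet; more input could complete it
    m = pat.match('first line\nsecond line */', partial=True)
    print(m.partial)  # False: a complete match, spanning the newline

When the lexer gets a partial match, it reads another block and retries; only a complete match produces a token.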
20 changes: 20 additions & 0 deletions Pipfile.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion jeff65/blum/image.py
@@ -14,7 +14,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import re
+import regex as re
 import struct
 from . import symbol, types

24 changes: 11 additions & 13 deletions jeff65/gold/compiler.py
@@ -47,23 +47,21 @@ def open_unit(unit):
 
 
 def parse(fileobj, name):
-    stream = parsing.ReStream(fileobj)
-    tree = grammar.parse(
-        stream, grammar.lex,
-        lambda t, s, c, m: ast.AstNode(t, span=s, attrs={
-            f"{k:02}": v for k, v in enumerate(c)
-        }))
-    # if parser._syntaxErrors > 0:
-    #     raise ast.ParseError("Unit {} had errors; terminating".format(name))
+    with parsing.ReStream(fileobj) as stream:
+        tree = grammar.parse(
+            stream, grammar.lex,
+            lambda t, s, c, m: ast.AstNode(t, span=s, attrs={
+                f"{k:02}": v for k, v in enumerate(c)
+            }))
     return tree.transform(simplify.Simplify())
 
 
 def translate(unit):
-    with open_unit(unit) as input_file:
-        obj = parse(input_file, name=unit.name)
-        for p in passes:
-            obj = obj.transform(p())
-            logger.debug(__("Pass {}:\n{:p}", p.__name__, obj))
+    # parse will close the file for us
+    obj = parse(open_unit(unit), name=unit.name)
+    for p in passes:
+        obj = obj.transform(p())
+        logger.debug(__("Pass {}:\n{:p}", p.__name__, obj))
 
     archive = blum.Archive()
     for node in obj.select("toplevels", "stmt"):
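
Since ReStream now owns (and closes) the underlying stream, parse wraps lexing in a with block, and translate can hand the opened unit straight through. A minimal usage sketch (hypothetical file name; not code from this commit):

    from jeff65 import parsing

    # ReStream closes the underlying file when the block exits, even if
    # lexing or parsing raises partway through.
    with parsing.ReStream(open('example.gold', 'rb')) as stream:
        ...  # feed `stream` to grammar.parse / grammar.lex
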
24 changes: 9 additions & 15 deletions jeff65/gold/grammar.py
@@ -15,7 +15,7 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import enum
-import re
+import regex as re
 from ..parsing import Grammar, Lexer, Parser, ReStream, Rule


@@ -161,27 +161,19 @@ def _w(word):
     (Mode.COMMENT, re.escape('/*'), T.COMMENT_OPEN, ReStream.CHANNEL_HIDDEN),
     (Mode.COMMENT, re.escape('*/'), T.COMMENT_CLOSE, ReStream.CHANNEL_HIDDEN),
 
-    # This is necessary because the next pattern matches up to, but not
-    # including, the newline; however, it will happily match zero characters,
-    # causing an infinite loop. This matches that last newline.
-    (Mode.COMMENT, r'\n', T.COMMENT_TEXT, ReStream.CHANNEL_HIDDEN),
-
-    # Matches either to the next comment-control token, or the end of the line,
-    # whichever happens first.
-    (Mode.COMMENT, r'.*?(?=\/\*|\*\/|$)', T.COMMENT_TEXT,
+    # Matches up to the next comment-control token.
+    (Mode.COMMENT, r'(?s).*?(?=\/\*|\*\/)', T.COMMENT_TEXT,
      ReStream.CHANNEL_HIDDEN),
 
     # String delimiter. When the lexer comes back, it will be in string mode
     (re.escape('"'), T.STRING_DELIM),
 
     # String control tokens
-    (Mode.STRING, r'\\.', T.STRING_ESCAPE),
+    (Mode.STRING, r'(?s)\\.', T.STRING_ESCAPE),
     (Mode.STRING, re.escape('"'), T.STRING_DELIM),
 
-    # Matches non-special text inside a string. The newline-matching pattern is
-    # for the same reason as for comments.
-    (Mode.STRING, r'\n', T.STRING),
-    (Mode.STRING, r'.*?(?=\\|"|$)', T.STRING),
+    # Matches non-special text inside a string.
+    (Mode.STRING, r'(?s).*?(?=\\|")', T.STRING),
 
     # operators & punctuation. These must be ordered such that if A is a prefix
     # of B, then B comes before A. The easiest way to do this is to order them
@@ -216,7 +208,9 @@ def _w(word):
     (re.escape('}'), T.BRACE_CLOSE),
 
     # If we fail to match anything, consume one character, and move on.
-    (r'.', T.MYSTERY),
+    (r'(?s).', T.MYSTERY),
+    (Mode.STRING, r'(?s).', T.MYSTERY),
+    (Mode.COMMENT, r'(?s).', T.MYSTERY, ReStream.CHANNEL_HIDDEN),
 ])


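
The essential change to the token table is the inline (?s) (DOTALL) flag, which lets . match newline characters, plus dropping the $ alternatives from the lookaheads, so comment and string text no longer stops at the end of a line and the old per-line \n rules become unnecessary. A standalone illustration of the flag (not code from the commit):

    import regex as re

    text = 'spans\ntwo lines */'
    # With (?s), '.' matches the newline, so the comment-text pattern can
    # run up to the closing token even across lines:
    print(re.match(r'(?s).*?(?=\*/)', text).group())  # 'spans\ntwo lines '
    # Without it, '.' stops at '\n' and no match is possible:
    print(re.match(r'.*?(?=\*/)', text))              # None

The added per-mode MYSTERY fallbacks keep the guarantee that every mode can always consume at least one character, so the lexer cannot loop without making progress.
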
130 changes: 103 additions & 27 deletions jeff65/parsing.py
@@ -15,9 +15,11 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import attr
+import io
 import logging
-import re
+import regex as re
 import time
+from collections import deque
 from itertools import chain
 
 logger = logging.getLogger(__name__)
@@ -93,32 +95,76 @@ class ReStream:
     CHANNEL_DEFAULT = 0
     CHANNEL_HIDDEN = 1
 
-    def __init__(self, stream):
-        self.stream = iter(stream)
-        self.current = None
-        self.line = 0
+    def __init__(self, stream, encoding='utf8', blocksize=4096):
+        self.current = deque()
+        self.position = 0
+        self.bufsize = 0
+        self.line = 1  # the number of the actual current-position line
         self.column = 0
 
-        try:
-            self.advance_line()
-        except StopIteration:
-            self.current = ""
-            self.line = 1
+        # TODO: If we got a buffered stream, it'd be cool to pull the buffer
+        # size out of it directly, since e.g. file streams will use the block
+        # size of the underlying storage automatically.
+        self.blocksize = blocksize
+
+        if isinstance(stream, io.StringIO):
+            # Since everything is already in-memory anyway, just bypass the
+            # stream API.
+            self.encoding = None
+            self.bstream = None
+            self.current.append(stream.getvalue())
+            self.bufsize = len(self.current[0])
+        elif isinstance(stream, io.TextIOBase):
+            self.encoding = stream.encoding
+            self.bstream = stream.detach()
+        else:
+            self.encoding = encoding
+            self.bstream = stream
 
     def advance_line(self):
         """Advance the stream position to the beginning of the next line."""
         try:
-            self.current = next(self.stream)
+            self.extend_buffer()
         except StopIteration:
             raise
         else:
-            self.line += 1
-            self.column = 0
+            pass
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def close(self):
+        if self.bstream is not None:
+            self.bstream.close()
+            self.bstream = None
+
+    def extend_buffer(self):
+        """Extends the current buffer to include an additional block."""
+
+        if self.bstream is None:
+            raise StopIteration
+
+        block = self.bstream.read(self.blocksize)
+        if len(block) == 0:
+            raise StopIteration
 
-    def assure_line(self):
-        """Assures that at least one character remains in the current line."""
-        if self.column == len(self.current):
-            self.advance_line()
+        text = block.decode(self.encoding)
+        self.current.append(text)
+        self.bufsize += len(text)
+
+    def trim_buffer(self):
+        """Trims already-consumed blocks from the buffer."""
+
+        while len(self.current) > 0 and self.position > len(self.current[0]):
+            count = len(self.current.popleft())
+            self.position -= count
+            self.bufsize -= count
+
+    def assure_buffer(self):
+        """Assures that at least one character remains in the buffer."""
+
+        while self.position >= self.bufsize:
+            self.extend_buffer()
+        self.trim_buffer()
 
     def match(self, regex):
         """Match the given regex at the current stream position.
@@ -129,17 +175,45 @@ def match(self, regex):
 
         Returns a match object if successful, None otherwise.
         """
 
-        self.assure_line()
-        return regex.match(self.current, self.column)
+        # The approach taken here, of reattempting partial matches with longer
+        # and longer inputs, leaves something to be desired. In particular, it
+        # means that multiline matches technically run in O(n^2) time. In
+        # practice, unless you have megabytes of comments (or strings), it
+        # shouldn't be an issue.
+
+        self.assure_buffer()
+        assert self.position < len(self.current[0])  # else buf expr is wrong
+        while True:
+            buf = "".join(self.current)
+            m = regex.match(buf, self.position, partial=True)
+            if not m or not m.partial:
+                return m
+
+            try:
+                self.extend_buffer()
+            except StopIteration:
+                # EOF, so see if the partial match is valid as a full match.
+                return regex.match(buf, self.position)
 
     def produce(self, symbol, match, channel=CHANNEL_DEFAULT):
         """Produce a token and advance the position."""
 
-        token = Token(symbol, match.group(), channel,
+        text = match.group()
+        end_line = self.line + text.count("\n")
+        if end_line > self.line:
+            end_column = len(text) - text.rindex("\n") - 1
+        else:
+            end_column = self.column + len(text)
+
+        token = Token(symbol, text, channel,
                       TextSpan(
                           self.line, self.column,
-                          self.line, match.end()))
-        self.column = match.end()
+                          end_line, end_column))
+        self.position = match.end()
+        self.line = end_line
+        self.column = end_column
+
+        # note that we don't trim the buffer, in case a rewind is needed.
         return token
 
     def rewind(self, token: Token):
@@ -150,7 +224,9 @@ def rewind(self, token: Token):
         """
 
         assert token.span.end == (self.line, self.column)
         self.line = token.span.start_line
         self.column = token.span.start_column
+        self.position -= len(token.text)
 
     def produce_eof(self, symbol):
         """Produce an EOF token."""
@@ -183,7 +259,7 @@ def __init__(self, eof, rules):
 
     def __call__(self, stream: ReStream, mode: int) -> Token:
         try:
-            stream.assure_line()
+            stream.assure_buffer()
         except StopIteration:
             return stream.produce_eof(self.eof)

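
The core of the new approach is the loop in match(): join the buffered blocks, try the regex with partial=True, and read another block whenever the result is only partial. Condensed into a standalone sketch (hypothetical helper name, simplified from the code above):

    import io
    from collections import deque

    import regex

    def match_stream(pattern, bstream, blocksize=8):
        # Grow the buffer for as long as the regex reports a partial match.
        blocks = deque()
        while True:
            buf = ''.join(blocks)
            m = pattern.match(buf, 0, partial=True)
            if m is None or not m.partial:
                return m  # full match, or definite failure
            block = bstream.read(blocksize).decode('utf8')
            if not block:
                # EOF: check whether the partial match stands as a full one.
                return pattern.match(buf, 0)
            blocks.append(block)

    pat = regex.compile(r'(?s).*?(?=\*/)')
    src = io.BytesIO(b'a comment\nover two lines */ code')
    print(match_stream(pat, src).group())  # 'a comment\nover two lines '

As the commit's own comment notes, re-running the match over an ever-longer buffer is O(n^2) in the worst case, which is acceptable for realistic comment and string lengths.
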
1 change: 1 addition & 0 deletions setup.cfg
@@ -16,6 +16,7 @@ classifiers =
 packages = find:
 install_requires =
     attrs>=18.1.0
+    regex>=2018.08.29
 python_requires = >=3.6
 
 [flake8]
