Merge pull request #70 from anachronauts/multiline-restream
Teach lexer to match token regexes over multiple lines
jdpage authored Sep 18, 2018
2 parents 032dea1 + 2791f1f commit 87e4892
Showing 6 changed files with 145 additions and 56 deletions.
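
In summary: the lexer previously pulled input a line at a time, so no token regex could match across a newline, and comments and strings needed special per-line rules. This commit buffers input in blocks inside ReStream and switches from the stdlib re module to the third-party regex module, whose partial-match support tells the lexer when a match might continue into unread input. A minimal standalone sketch of that capability (illustration only, not code from the commit):

    import regex

    pat = regex.compile(r'(?s).*?(?=\*/)')  # comment text up to '*/'
    m = pat.match('first line\nsecond li', partial=True)
    print(m.partial)  # True: '*/' not seen yet; more input could complete it
    m = pat.match('first line\nsecond line */', partial=True)
    print(m.partial)  # False: a complete match, spanning the newline

When the lexer gets a partial match, it reads another block and retries; only a complete match produces a token.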
20 changes: 20 additions & 0 deletions Pipfile.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion jeff65/blum/image.py
@@ -14,7 +14,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
-import re
+import regex as re
 import struct
 from . import symbol, types

24 changes: 11 additions & 13 deletions jeff65/gold/compiler.py
@@ -47,23 +47,21 @@ def open_unit(unit):
 
 
 def parse(fileobj, name):
-    stream = parsing.ReStream(fileobj)
-    tree = grammar.parse(
-        stream, grammar.lex,
-        lambda t, s, c, m: ast.AstNode(t, span=s, attrs={
-            f"{k:02}": v for k, v in enumerate(c)
-        }))
-    # if parser._syntaxErrors > 0:
-    #     raise ast.ParseError("Unit {} had errors; terminating".format(name))
+    with parsing.ReStream(fileobj) as stream:
+        tree = grammar.parse(
+            stream, grammar.lex,
+            lambda t, s, c, m: ast.AstNode(t, span=s, attrs={
+                f"{k:02}": v for k, v in enumerate(c)
+            }))
     return tree.transform(simplify.Simplify())
 
 
 def translate(unit):
-    with open_unit(unit) as input_file:
-        obj = parse(input_file, name=unit.name)
-        for p in passes:
-            obj = obj.transform(p())
-            logger.debug(__("Pass {}:\n{:p}", p.__name__, obj))
+    # parse will close the file for us
+    obj = parse(open_unit(unit), name=unit.name)
+    for p in passes:
+        obj = obj.transform(p())
+        logger.debug(__("Pass {}:\n{:p}", p.__name__, obj))
 
     archive = blum.Archive()
     for node in obj.select("toplevels", "stmt"):
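
Since ReStream now owns (and closes) the underlying stream, parse wraps lexing in a with block, and translate can hand the opened unit straight through. A minimal usage sketch (hypothetical file name; not code from this commit):

    from jeff65 import parsing

    # ReStream closes the underlying file when the block exits, even if
    # lexing or parsing raises partway through.
    with parsing.ReStream(open('example.gold', 'rb')) as stream:
        ...  # feed `stream` to grammar.parse / grammar.lex
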
24 changes: 9 additions & 15 deletions jeff65/gold/grammar.py
@@ -15,7 +15,7 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import enum
-import re
+import regex as re
 from ..parsing import Grammar, Lexer, Parser, ReStream, Rule


@@ -161,27 +161,19 @@ def _w(word):
     (Mode.COMMENT, re.escape('/*'), T.COMMENT_OPEN, ReStream.CHANNEL_HIDDEN),
     (Mode.COMMENT, re.escape('*/'), T.COMMENT_CLOSE, ReStream.CHANNEL_HIDDEN),
 
-    # This is necessary because the next pattern matches up to, but not
-    # including, the newline; however, it will happily match zero characters,
-    # causing an infinite loop. This matches that last newline.
-    (Mode.COMMENT, r'\n', T.COMMENT_TEXT, ReStream.CHANNEL_HIDDEN),
-
-    # Matches either to the next comment-control token, or the end of the line,
-    # whichever happens first.
-    (Mode.COMMENT, r'.*?(?=\/\*|\*\/|$)', T.COMMENT_TEXT,
+    # Matches up to the next comment-control token.
+    (Mode.COMMENT, r'(?s).*?(?=\/\*|\*\/)', T.COMMENT_TEXT,
      ReStream.CHANNEL_HIDDEN),
 
     # String delimiter. When the lexer comes back, it will be in string mode
     (re.escape('"'), T.STRING_DELIM),
 
     # String control tokens
-    (Mode.STRING, r'\\.', T.STRING_ESCAPE),
+    (Mode.STRING, r'(?s)\\.', T.STRING_ESCAPE),
     (Mode.STRING, re.escape('"'), T.STRING_DELIM),
 
-    # Matches non-special text inside a string. The newline-matching pattern is
-    # for the same reason as for comments.
-    (Mode.STRING, r'\n', T.STRING),
-    (Mode.STRING, r'.*?(?=\\|"|$)', T.STRING),
+    # Matches non-special text inside a string.
+    (Mode.STRING, r'(?s).*?(?=\\|")', T.STRING),
 
     # operators & punctuation. These must be ordered such that if A is a prefix
     # of B, then B comes before A. The easiest way to do this is to order them
@@ -216,7 +208,9 @@ def _w(word):
     (re.escape('}'), T.BRACE_CLOSE),
 
     # If we fail to match anything, consume one character, and move on.
-    (r'.', T.MYSTERY),
+    (r'(?s).', T.MYSTERY),
+    (Mode.STRING, r'(?s).', T.MYSTERY),
+    (Mode.COMMENT, r'(?s).', T.MYSTERY, ReStream.CHANNEL_HIDDEN),
 ])


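
The essential change to the token table is the inline (?s) (DOTALL) flag, which lets . match newline characters, plus dropping the $ alternatives from the lookaheads, so comment and string text no longer stops at the end of a line and the old per-line \n rules become unnecessary. A standalone illustration of the flag (not code from the commit):

    import regex as re

    text = 'spans\ntwo lines */'
    # With (?s), '.' matches the newline, so the comment-text pattern can
    # run up to the closing token even across lines:
    print(re.match(r'(?s).*?(?=\*/)', text).group())  # 'spans\ntwo lines '
    # Without it, '.' stops at '\n' and no match is possible:
    print(re.match(r'.*?(?=\*/)', text))              # None

The added per-mode MYSTERY fallbacks keep the guarantee that every mode can always consume at least one character, so the lexer cannot loop without making progress.
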
130 changes: 103 additions & 27 deletions jeff65/parsing.py
@@ -15,9 +15,11 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 
 import attr
+import io
 import logging
-import re
+import regex as re
 import time
+from collections import deque
 from itertools import chain
 
 logger = logging.getLogger(__name__)
@@ -93,32 +95,76 @@ class ReStream:
     CHANNEL_DEFAULT = 0
     CHANNEL_HIDDEN = 1
 
-    def __init__(self, stream):
-        self.stream = iter(stream)
-        self.current = None
-        self.line = 0
+    def __init__(self, stream, encoding='utf8', blocksize=4096):
+        self.current = deque()
+        self.position = 0
+        self.bufsize = 0
+        self.line = 1  # the number of the actual current-position line
         self.column = 0
 
-        try:
-            self.advance_line()
-        except StopIteration:
-            self.current = ""
-            self.line = 1
+        # TODO: If we got a buffered stream, it'd be cool to pull the buffer
+        # size out of it directly, since e.g. file streams will use the block
+        # size of the underlying storage automatically.
+        self.blocksize = blocksize
+
+        if isinstance(stream, io.StringIO):
+            # Since everything is already in-memory anyway, just bypass the
+            # stream API.
+            self.encoding = None
+            self.bstream = None
+            self.current.append(stream.getvalue())
+            self.bufsize = len(self.current[0])
+        elif isinstance(stream, io.TextIOBase):
+            self.encoding = stream.encoding
+            self.bstream = stream.detach()
+        else:
+            self.encoding = encoding
+            self.bstream = stream
 
     def advance_line(self):
         """Advance the stream position to the beginning of the next line."""
         try:
-            self.current = next(self.stream)
+            self.extend_buffer()
         except StopIteration:
             raise
         else:
-            self.line += 1
-            self.column = 0
+            pass
 
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def close(self):
+        if self.bstream is not None:
+            self.bstream.close()
+            self.bstream = None
+
+    def extend_buffer(self):
+        """Extends the current buffer to include an additional block."""
+
+        if self.bstream is None:
+            raise StopIteration
+
+        block = self.bstream.read(self.blocksize)
+        if len(block) == 0:
+            raise StopIteration
 
-    def assure_line(self):
-        """Assures that at least one character remains in the current line."""
-        if self.column == len(self.current):
-            self.advance_line()
+        text = block.decode(self.encoding)
+        self.current.append(text)
+        self.bufsize += len(text)
+
+    def trim_buffer(self):
+        """Trims already-consumed blocks from the buffer."""
+
+        while len(self.current) > 0 and self.position > len(self.current[0]):
+            count = len(self.current.popleft())
+            self.position -= count
+            self.bufsize -= count
+
+    def assure_buffer(self):
+        """Assures that at least one character remains in the buffer."""
+
+        while self.position >= self.bufsize:
+            self.extend_buffer()
+        self.trim_buffer()
 
     def match(self, regex):
         """Match the given regex at the current stream position.
@@ -129,17 +175,45 @@ def match(self, regex):
 
         Returns a match object if successful, None otherwise.
         """
 
-        self.assure_line()
-        return regex.match(self.current, self.column)
+        # The approach taken here, of reattempting partial matches with longer
+        # and longer inputs, leaves something to be desired. In particular, it
+        # means that multiline matches technically run in O(n^2) time. In
+        # practice, unless you have megabytes of comments (or strings), it
+        # shouldn't be an issue.
+
+        self.assure_buffer()
+        assert self.position < len(self.current[0])  # else buf expr is wrong
+        while True:
+            buf = "".join(self.current)
+            m = regex.match(buf, self.position, partial=True)
+            if not m or not m.partial:
+                return m
+
+            try:
+                self.extend_buffer()
+            except StopIteration:
+                # EOF, so see if the partial match is valid as a full match.
+                return regex.match(buf, self.position)
 
     def produce(self, symbol, match, channel=CHANNEL_DEFAULT):
         """Produce a token and advance the position."""
 
-        token = Token(symbol, match.group(), channel,
+        text = match.group()
+        end_line = self.line + text.count("\n")
+        if end_line > self.line:
+            end_column = len(text) - text.rindex("\n") - 1
+        else:
+            end_column = self.column + len(text)
+
+        token = Token(symbol, text, channel,
                       TextSpan(
                           self.line, self.column,
-                          self.line, match.end()))
-        self.column = match.end()
+                          end_line, end_column))
+        self.position = match.end()
+        self.line = end_line
+        self.column = end_column
+
+        # note that we don't trim the buffer, in case a rewind is needed.
         return token
 
     def rewind(self, token: Token):
@@ -150,7 +224,9 @@ def rewind(self, token: Token):
         """
 
         assert token.span.end == (self.line, self.column)
         self.line = token.span.start_line
         self.column = token.span.start_column
+        self.position -= len(token.text)
 
     def produce_eof(self, symbol):
         """Produce an EOF token."""
@@ -183,7 +259,7 @@ def __init__(self, eof, rules):
 
     def __call__(self, stream: ReStream, mode: int) -> Token:
         try:
-            stream.assure_line()
+            stream.assure_buffer()
         except StopIteration:
             return stream.produce_eof(self.eof)

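
The core of the new approach is the loop in match(): join the buffered blocks, try the regex with partial=True, and read another block whenever the result is only partial. Condensed into a standalone sketch (hypothetical helper name, simplified from the code above):

    import io
    from collections import deque

    import regex

    def match_stream(pattern, bstream, blocksize=8):
        # Grow the buffer for as long as the regex reports a partial match.
        blocks = deque()
        while True:
            buf = ''.join(blocks)
            m = pattern.match(buf, 0, partial=True)
            if m is None or not m.partial:
                return m  # full match, or definite failure
            block = bstream.read(blocksize).decode('utf8')
            if not block:
                # EOF: check whether the partial match stands as a full one.
                return pattern.match(buf, 0)
            blocks.append(block)

    pat = regex.compile(r'(?s).*?(?=\*/)')
    src = io.BytesIO(b'a comment\nover two lines */ code')
    print(match_stream(pat, src).group())  # 'a comment\nover two lines '

As the commit's own comment notes, re-running the match over an ever-longer buffer is O(n^2) in the worst case, which is acceptable for realistic comment and string lengths.
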
1 change: 1 addition & 0 deletions setup.cfg
@@ -16,6 +16,7 @@ classifiers =
 packages = find:
 install_requires =
     attrs>=18.1.0
+    regex>=2018.08.29
 python_requires = >=3.6
 
 [flake8]
