fix bug with comments in multiline annotated assignments

abarker · Jul 28, 2021 · 489d77b · 489d77b
1 parent fc4ab22
commit 489d77b
Show file tree

Hide file tree

Showing 8 changed files with 123 additions and 46 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,13 @@ Changelog
 
 Added the options ``--outfile`` and ``--inplace``, along with ``--help``.
 
+Bug fixes:
+
+* Fixed a bug where comments in function return hints that contain both newlines
+  and comments cause syntax error when the ``--strip-nl`` option is used.
+
+* Fixed a bug in multiline annotated expressions with comments on the lines.
+
 0.1.9 (2020-05-06)
 ------------------
 

diff --git a/README.rst b/README.rst
@@ -11,10 +11,10 @@ processed code file also correspond to those for the original code file.  In
 most cases, with the default options, both the line and column numbers are
 preserved.
 
-The stripping operation can be used as a preprocessor to allow the new type
-hint syntax to be used in Python 2.  The main intended application is for code
-which is being developed in Python 3 but which needs backward compatibility to
-Python 2.
+This program was originally written to strip hints from Python 2 code to allow
+for developing such code on Python 3 but running it with Python 2.  Since
+Python 2 is no longer maintained the program is no longer tested when run with
+Python 2 (but it probably still works).
 
 This project also contains a general-purpose class named ``TokenList`` which
 allows lists of Python tokens to be operated on using an interface similar to
@@ -71,28 +71,28 @@ The command-line options are as follows:
    Do not parse the resulting code with the Python ``ast`` module to check it.
    Default is false.
 
+``--only-assigns-and-defs``
+   Only strip annotated assignments and standalone type definitions, keeping
+   function signature annotations.  Python 3.5 and earlier do not implement
+   these; they first appeared in Python 3.6.  The default is false.
+
+``--only-test-for-changes``
+   Only test if any changes would be made.  If any stripping would be done then
+   it prints ``True`` and exits with code 0.  Otherwise it prints ``False`` and
+   exits with code 1.
+
 ``--no-colon-move``
    Do not move colons to fix line breaks that occur in the hints for the
    function return type.  Default is false.  See the Limitations section below
-   for more information.
+   for more information.  Not recommended.
 
 ``--no-equal-move``
    Do not move the assignment with ``=`` when needed to fix annotated
    assignments that include newlines in the type hints.  When they are moved
    the total number of lines is kept the same in order to preserve line number
    correspondence between the stripped and non-stripped files.  If this option
    is selected and such a situation occurs an exception is raised.  See the
-   Limitations section below for more information.
-
-``--only-assigns-and-defs``
-   Only strip annotated assignments and standalone type definitions, keeping
-   function signature annotations.  Python 3.5 and earlier do not implement
-   these; they first appeared in Python 3.6.  The default is false.
-
-``--only-test-for-changes``
-   Only test if any changes would be made.  If any stripping would be done then
-   it prints ``True`` and exits with code 0.  Otherwise it prints ``False`` and
-   exits with code 1.
+   Limitations section below for more information.  Not recommended.
 
 If you are using the development repo you can just run the file
 ``strip_hints.py`` in the ``bin`` directory of the repo::
@@ -162,12 +162,13 @@ hint in an annotated assignment:
    x: List[int,
            int] = [1,2]
 
-The program currently handles this by moving the line with ``=`` (and the
-following lines) to the end of the line with ``x``.  Empty lines are added to
-the end to keep to total number of lines the same.  The ``--no-equal-move``
-argument turns this off, in which case situations like those above raise
-exceptions.  (As a workaround if necessary with ``--no-equal-move``, using an
-explicit backslash line continuation seems to work.)
+The program currently handles this by removing the newlines up to the ``=``
+sign.  Any comments on those lines are also stripped, since otherwise they
+cause syntax errors.  Empty lines are added to the end to keep to total number
+of lines the same.  The ``--no-equal-move`` argument turns this off, in which
+case situations like those above raise exceptions.  (As a workaround if
+necessary to use ``--no-equal-move``, using an explicit backslash line
+continuation seems to work.)
 
 A similar situation can occur in return type specifications:
 

diff --git a/src/strip_hints/strip_hints_main.py b/src/strip_hints/strip_hints_main.py
@@ -93,7 +93,8 @@
    method as was used for individual function parameters.  If it is only a type
    definition (without an assignment) then turn it into a comment by changing the
    first character to pound sign.  Disallow `NL` tokens in whited-out code.
-   (TODO: update description for current code handling simple annotated expressions.)
+   If an annotated assignment then comments in the typedef are also whited out
+   to avoid the syntax errors they cause.
 
 The algorithm only handles simple annotated expressions in step 3 that start
 with a name, e.g., not ones like `(x) : int`.
@@ -151,9 +152,9 @@ def __init__(self, to_empty, strip_nl, no_ast,  no_colon_move, no_equal_move,
 
     def check_whited_out_line_breaks(self, token_list, rpar_and_colon=None):
         """Check that a `TokenList` instance to be whited-out does not include a
-        newline (since it would no longer be nested and valid).  This routine
-        also moves the colon if `rpar_and_colon` is passed a token list and
-        the `--no-colon-move` option is not selected."""
+        newline (since the newlines would no longer be nested).  This routine
+        also moves the colon if `rpar_and_colon` is passed the corresponding token
+        # list (except if the `--no-colon-move` option is selected)."""
         # Note this check is used
         #    1) when the `--no-equal-move` option is selected
         #       to check annotated assignments
@@ -167,8 +168,14 @@ def check_whited_out_line_breaks(self, token_list, rpar_and_colon=None):
         # the "\" except implicitly in the start and end component numbers.
         # For some info, see these issues with backslash in untokenize and the two
         # distinct modes: https://bugs.python.org/issue12691
+        #
+        # Annotated expressions with assignment are different because you cannot
+        # just move the = sign like the colon (it has an expression after it).
+
         if self.strip_nl:
-            return # NL tokens will be set to empty strings, so no problem.
+            if not any(t.type_name == "COMMENT" for t in token_list):
+                return # NL tokens will be set to empty strings, OK with no comments.
+
         moved_colon = False
         for t in token_list:
             if t.type_name == "NL":
@@ -188,20 +195,16 @@ def process_single_parameter(self, parameter, nesting_level, annassign=False):
         """Process a single parameter in a function definition.  Setting `annassign`
         makes slight changes to instead handle annotated assignments."""
 
-        # First do exactly one split on colon or equal sign.
+        #
+        # First do exactly one split on the first colon or equal sign.
+        #
 
         split_on_colon_or_equal, splits = parameter.split(token_values=":=",
                                                           only_nestlevel=nesting_level,
                                                           sep_on_left=False, max_split=1,
                                                           return_splits=True)
         if len(split_on_colon_or_equal) == 1: # Just a variable name, no type or annotation.
-            if annassign:
-                # TODO: This code condition can never be reached.  Make sure and then delete.
-                self.check_whited_out_line_breaks(split_on_colon_or_equal[0])
-                for t in split_on_colon_or_equal[0]:
-                    t.to_whitespace(empty=self.to_empty, strip_nl=self.strip_nl)
             return
-
         assert len(split_on_colon_or_equal) == 2
         assert len(splits) == 1
 
@@ -210,13 +213,16 @@ def process_single_parameter(self, parameter, nesting_level, annassign=False):
         if splits[0].string == "=":
             return # Parameter is just a variable with a regular default value.
 
-        # At this point we found an annotation.
-        # Now do exactly one split the right part on equal.
+        #
+        # At this point we have an annotation (maybe with assignment) or a
+        # typed parameter (maybe with default).  Now do exactly one split of
+        # the right part, on equal, to test for an assignment or default value.
+        #
 
         split_on_equal = right_part.split(token_values="=",
                                           only_nestlevel=nesting_level,
                                           max_split=1, sep_on_left=False)
-        if len(split_on_equal) == 1: # Got a type def, no assignment or default.
+        if len(split_on_equal) == 1: # Got a type def, no assignment or default value.
             if annassign: # Make into a comment (if not a fun parameter).
                 skip_set = ignored_types_set.copy()
                 skip_set.remove(tokenize.NL)
@@ -233,21 +239,30 @@ def process_single_parameter(self, parameter, nesting_level, annassign=False):
                         first_non_nl_token = True
                 return
 
+        #
+        # At this point we have an annotated assignment or typed param with default.
+        #
+
         type_def = split_on_equal[0]
+        has_assignment = len(split_on_equal) > 1
 
         if annassign and self.no_equal_move:
             self.check_whited_out_line_breaks(type_def)
 
+        # Loop through counting newlines while whiting-out the type def part.
         nl_count = 0
         for t in type_def:
             if t.type == tokenize.NL:
                 nl_count += 1
             strip_nl = self.strip_nl
+            strip_comments = False
             if annassign and not self.no_equal_move:
-                strip_nl = True
-            t.to_whitespace(empty=self.to_empty, strip_nl=strip_nl)
+                strip_nl = True # Use strip_nl to move annassign's `= ...` part (if needed).
+                strip_comments = True # Comments can cause syntax errors with strip_nl.
+            t.to_whitespace(empty=self.to_empty,
+                            strip_nl=strip_nl, strip_comments=strip_comments)
 
-        has_assignment = len(split_on_equal) > 1
+        # Replace any stripped newlines so line numbers match after the change.
         if annassign and not self.no_equal_move and has_assignment:
             split_on_equal[1][-1].string += "\n" * nl_count
 
@@ -273,16 +288,18 @@ def process_parameters(self, parameters, nesting_level):
                                               nesting_level=nesting_level)
                 prev_comma_plus_one = count + 1
 
-
     def process_return_part(self, return_part, rpar_token):
         """Process the return part of the function definition (which may just be a
         colon if no `->` is used."""
+        colon_token = None
         if not return_part:
             return # Error condition, but ignore.
         for i in reversed(range(len(return_part))):
             if return_part[i].string == ":":
                 colon_token = return_part[i]
                 break
+        if colon_token is None:
+            raise StripHintsException("Error: colon_token not set in process_return_part.")
         return_type_spec = return_part[:i]
         self.check_whited_out_line_breaks(return_type_spec,
                                      rpar_and_colon=(rpar_token, colon_token))

diff --git a/src/strip_hints/token_list.py b/src/strip_hints/token_list.py
@@ -73,8 +73,9 @@ class Token(object):
     """Represents a token from the Python tokenizer.
 
     The tokens are mutable via changing the named attributes such as `type` and
-    `string`.  Accessing the `token_tuple` property returns a tuple of the
-    current values (the format Python's `untokenize` function expects)."""
+    `string`.  Accessing the `token_tuple` property or converting to type `tuple`
+    returns a tuple of the current values (the format Python's `untokenize`
+    function expects)."""
 
     def __init__(self, token_iterable, nesting_level=None, filename=None,
                  compat_mode=False):
@@ -120,7 +121,7 @@ def token_tuple(self):
             current_tuple = (self.type, self.string, self.start, self.end, self.line)
         return current_tuple
 
-    def to_whitespace(self, empty=False, strip_nl=False):
+    def to_whitespace(self, empty=False, strip_nl=False, strip_comments=False):
         """Convert the string or value of the token to a string of spaces of
         the same length as the original string.
 
@@ -131,13 +132,19 @@ def to_whitespace(self, empty=False, strip_nl=False):
         if strip_nl and self.type == tokenize.NL:
             self.string = ""
             return
+        if strip_comments and self.type == tokenize.COMMENT:
+            self.string = ""
+            return
         if self.type in ignored_types_set:
             return
         if empty:
             self.string = ""
         else:
             self.string = " " * len(self.token_tuple[1])
 
+    def __tuple__(self):
+        return self.token_tuple
+
     def __repr__(self):
         return self.simple_repr()
         # The __repr__ is used when lists of tokens are printed, and below is
@@ -270,7 +277,7 @@ def read_from_readline_interface(self, readline, filename=None, compat_mode=Fals
                            filename=filename, compat_mode=self.compat_mode))
 
     def untokenize(self, encoding=None):
-        """Convert the current list of tokens into code and return the code.
+        """Convert the current list of tokens into a code string and return it.
         If no `encoding` is supplied the one stored with the list from when
         it was created is used."""
         if not encoding:
@@ -282,7 +289,7 @@ def untokenize(self, encoding=None):
         result = tokenize.untokenize(token_tuples)
         if version == 2: # Decoding below causes a unicode error in Python 2.
             return result
-        decoded_result = result.decode(encoding)
+        decoded_result = result if isinstance(result, str) else result.decode(encoding)
         return decoded_result
 
     def iter_with_skips(self, skip_types=None, skip_type_names=None, skip_values=None):

diff --git a/test/run_importer.py b/test/run_importer.py
diff --git a/test/run_strip_from_string_test.py b/test/run_strip_from_string_test.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 
 Usage: strip_from_string_test.py <filename>

diff --git a/test/test_NL_in_annotated_assigns_and_decls.py b/test/test_NL_in_annotated_assigns_and_decls.py
@@ -69,3 +69,25 @@ def f():
     # f 2
 
     pass
+
+#
+# Below tests comments in the new lines, which cause their own problems.
+#
+
+# Note: this one below fails with --no-colon-move option (as it should).
+def egg(x:int) -> List[ # function
+
+        int,  # first arg
+        float # second arg
+
+        # another comment
+        ] :
+    pass
+
+d['c']: List[ # This works, but adds all the whitespace before the `= 0`
+        # Another comment.
+
+        int,  # part on the first line (the `=` is moved up).Q
+
+        int] = a[ 4 ,5]
+
diff --git a/test/test_NL_in_annotated_assigns_and_decls.py.results b/test/test_NL_in_annotated_assigns_and_decls.py.results
@@ -69,3 +69,25 @@ def f():
     # f 2
 
     pass
+
+#
+# Below tests comments in the new lines, which cause their own problems.
+#
+
+# Note: this one below fails with --no-colon-move option (as it should).
+def egg(x    ):          # function
+
+              # first arg
+              # second arg
+
+        # another comment
+
+    pass
+
+d['c']                                           = a[ 4 ,5]
+
+
+
+
+
+