
Commit fc354f0

albertjan authored and vstinner committed
bpo-25324: copy tok_name before changing it (python#1608)
* add test to check if were modifying token
* copy list so import tokenize doesnt have side effects on token
* shorten line
* add tokenize tokens to token.h to get them to show up in token
* move ERRORTOKEN back to its previous location, and fix nitpick
* copy comments from token.h automatically
* fix whitespace and make more pythonic
* change to fix comments from @Haypo
* update token.rst and Misc/NEWS
* change wording
* some more wording changes
1 parent 85aba23 commit fc354f0
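The behaviour being fixed, in one illustrative sketch (not part of the commit, and assuming an interpreter built with this change): importing tokenize used to add COMMENT, NL and ENCODING to token.tok_name and bump token.N_TOKENS as a side effect; with this commit it no longer touches the token module at all.

import token

names_before = dict(token.tok_name)
n_tokens_before = token.N_TOKENS

import tokenize  # before this commit, this import mutated the token module

assert token.tok_name == names_before
assert token.N_TOKENS == n_tokens_before
print("importing tokenize leaves the token module unchanged")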

7 files changed (+52, -21 lines)


Diff for: Doc/library/token.rst (+8)

@@ -101,10 +101,18 @@ The token constants are:
           AWAIT
           ASYNC
           ERRORTOKEN
+          COMMENT
+          NL
+          ENCODING
           N_TOKENS
           NT_OFFSET
 
 .. versionchanged:: 3.5
    Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
    Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
    tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+.. versionchanged:: 3.7
+   Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+   the tokens in the C code in line with the tokens needed in
+   :mod:`tokenize` module. These tokens aren't used by the C tokenizer.
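With the documentation change above, the three tokenize-only types become ordinary members of the token module. A quick interactive check (illustrative only, assuming a Python 3.7 build that includes this commit):

import token

for name in ("COMMENT", "NL", "ENCODING"):
    value = getattr(token, name)
    print(name, value, token.tok_name[value])
# COMMENT 57 COMMENT
# NL 58 NL
# ENCODING 59 ENCODING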

Diff for: Include/token.h (+5, -1)

@@ -67,7 +67,11 @@ extern "C" {
 #define AWAIT           54
 #define ASYNC           55
 #define ERRORTOKEN      56
-#define N_TOKENS        57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT         57
+#define NL              58
+#define ENCODING        59
+#define N_TOKENS        60
 
 /* Special definitions for cooperation with parser */
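Include/token.h is the source that Lib/token.py is regenerated from, so after this change the Python-level constants should mirror the values above. A minimal sanity check (illustrative only; the expected numbers are taken from the #define lines in this diff):

import token

assert (token.COMMENT, token.NL, token.ENCODING, token.N_TOKENS) == (57, 58, 59, 60)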

Diff for: Lib/test/test_tokenize.py (+6, -6)

@@ -1343,13 +1343,13 @@ def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 2 + num_optypes)
-        self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+        self.assertEqual(tok_name[tokens[0].exact_type],
+                         tok_name[ENCODING])
         for i in range(num_optypes):
-            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
-                             token.tok_name[optypes[i]])
-        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
-                         token.tok_name[token.ENDMARKER])
+            self.assertEqual(tok_name[tokens[i + 1].exact_type],
+                             tok_name[optypes[i]])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
         self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
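The helper above compares token names through tok_name so that failures print readable names. Outside the test suite, the same exact_type machinery can be seen directly; a rough sketch (not part of the commit, and output details vary slightly between Python versions):

import token
from io import BytesIO
from tokenize import tokenize

for tok in tokenize(BytesIO(b"x -> y\n").readline):
    # .type reports a generic OP for operators; .exact_type resolves '->' to RARROW.
    print(token.tok_name[tok.type], token.tok_name[tok.exact_type], repr(tok.string))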

Diff for: Lib/token.py (+24, -5)

@@ -63,11 +63,17 @@
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
 AWAIT = 54
 ASYNC = 55
 ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
 NT_OFFSET = 256
 #--end constants--
 
@@ -102,15 +108,26 @@ def _main():
     with fp:
         lines = fp.read().split("\n")
     prog = re.compile(
-        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
         re.IGNORECASE)
+    comment_regex = re.compile(
+        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+        re.IGNORECASE)
+
     tokens = {}
+    prev_val = None
     for line in lines:
         match = prog.match(line)
         if match:
             name, val = match.group(1, 2)
             val = int(val)
-            tokens[val] = name          # reverse so we can sort them...
+            tokens[val] = {'token': name}          # reverse so we can sort them...
+            prev_val = val
+        else:
+            comment_match = comment_regex.match(line)
+            if comment_match and prev_val is not None:
+                comment = comment_match.group(1)
+                tokens[prev_val]['comment'] = comment
     keys = sorted(tokens.keys())
     # load the output skeleton from the target:
     try:
@@ -127,8 +144,10 @@ def _main():
         sys.stderr.write("target does not contain format markers")
         sys.exit(3)
     lines = []
-    for val in keys:
-        lines.append("%s = %d" % (tokens[val], val))
+    for key in keys:
+        lines.append("%s = %d" % (tokens[key]["token"], key))
+        if "comment" in tokens[key]:
+            lines.append("# %s" % tokens[key]["comment"])
     format[start:end] = lines
     try:
         fp = open(outFileName, 'w')
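To show what the new comment_regex buys, here is a standalone sketch of the harvesting loop (a simplified re-implementation for illustration, not the actual _main, run against a made-up three-line sample of Include/token.h). A /* ... */ comment line is attached to the #define that precedes it, which is how the "These aren't used by the C tokenizer..." comment is reproduced in the regenerated Lib/token.py.

import re

define_re = re.compile(
    r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)")
comment_re = re.compile(r"^\s*/\*\s*(.+?)\s*\*/\s*$")

sample = [
    "#define ERRORTOKEN      56",
    "/* These aren't used by the C tokenizer but are needed for tokenize.py */",
    "#define COMMENT         57",
]

tokens, prev_val = {}, None
for line in sample:
    match = define_re.match(line)
    if match:
        name, val = match.group(1, 2)
        prev_val = int(val)
        tokens[prev_val] = {"token": name}
    else:
        comment_match = comment_re.match(line)
        if comment_match and prev_val is not None:
            tokens[prev_val]["comment"] = comment_match.group(1)

for key in sorted(tokens):
    print("%s = %d" % (tokens[key]["token"], key))
    if "comment" in tokens[key]:
        print("# %s" % tokens[key]["comment"])
# ERRORTOKEN = 56
# # These aren't used by the C tokenizer but are needed for tokenize.py
# COMMENT = 57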

Diff for: Lib/tokenize.py (+2, -9)

@@ -38,17 +38,10 @@
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+                           "untokenize", "TokenInfo"]
 del token
 
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
 EXACT_TOKEN_TYPES = {
     '(': LPAR,
     ')': RPAR,
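With the module-level mutation above removed, COMMENT, NL and ENCODING reach tokenize through its existing "from token import *", so the two modules agree on the values by construction. An illustrative check (assuming a build with this commit applied):

import token
import tokenize

# tokenize no longer invents its own numbers for these token types.
assert tokenize.COMMENT == token.COMMENT
assert tokenize.NL == token.NL
assert tokenize.ENCODING == token.ENCODING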

Diff for: Misc/NEWS (+4)

@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
 Core and Builtins
 -----------------
 
+- bpo-25324: Tokens needed for parsing in Python moved to C. ``COMMENT``,
+  ``NL`` and ``ENCODING``. This way the tokens and tok_names in the token
+  module don't get changed when you import the tokenize module.
+
 - bpo-29104: Fixed parsing backslashes in f-strings.
 
 - bpo-27945: Fixed various segfaults with dict when input collections are

Diff for: Parser/tokenizer.c (+3)

@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
     "AWAIT",
     "ASYNC",
     "<ERRORTOKEN>",
+    "COMMENT",
+    "NL",
+    "ENCODING"
     "<N_TOKENS>"
 };
