Commit cbf0afd
gh-97973: Return all necessary information from the tokenizer (GH-97984)
Right now, the tokenizer returns only the token type and two pointers to the start and end of the token. This PR modifies the tokenizer to return the type and to set all of the necessary information itself, so that the parser does not have to do this.
1 parent: b9d2e81
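In terms of the C interface, the change looks roughly like this. This is a sketch reconstructed from the diff below, not the verbatim CPython headers, and the two fragments are alternatives rather than one translation unit:

    /* Old interface (sketch): the tokenizer returned the token type plus two
     * out-pointers, leaving all length and location math to the caller. */
    const char *start, *end;
    int type = _PyTokenizer_Get(p->tok, &start, &end);
    int name_len = (int)(end - start);   /* pointer arithmetic done in the parser */

    /* New interface (sketch): the tokenizer fills in a struct token up front,
     * so the caller reads precomputed offsets instead. */
    struct token new_token;
    int type = _PyTokenizer_Get(p->tok, &new_token);
    int name_len = new_token.end_col_offset - new_token.col_offset;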

6 files changed: +159 −146 lines
@@ -0,0 +1 @@
+Modify the tokenizer to return all the necessary information the parser needs to set location information in the AST nodes, so that the parser does not have to calculate those values by doing pointer arithmetic.
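Judging from the fields Parser/pegen.c reads further down in this commit, the struct token that the tokenizer now fills carries at least the following. This is an inferred sketch; the authoritative declaration lives in the tokenizer's own headers, which are not part of this extract:

    struct token {
        const char *start, *end;          /* token bounds in the input buffer */
        int level;                        /* parenthesis nesting depth */
        int lineno, col_offset;           /* start location */
        int end_lineno, end_col_offset;   /* end location */
    };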

Parser/pegen.c (+24 −30)
@@ -123,50 +123,45 @@ growable_comment_array_deallocate(growable_comment_array *arr) {
 }
 
 static int
-_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
+_get_keyword_or_name_type(Parser *p, struct token *new_token)
 {
+    int name_len = new_token->end_col_offset - new_token->col_offset;
     assert(name_len > 0);
+
     if (name_len >= p->n_keyword_lists ||
         p->keywords[name_len] == NULL ||
         p->keywords[name_len]->type == -1) {
         return NAME;
     }
     for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
-        if (strncmp(k->str, name, name_len) == 0) {
+        if (strncmp(k->str, new_token->start, name_len) == 0) {
             return k->type;
         }
     }
     return NAME;
 }
 
 static int
-initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
-    assert(token != NULL);
+initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
+    assert(parser_token != NULL);
 
-    token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
-    token->bytes = PyBytes_FromStringAndSize(start, end - start);
-    if (token->bytes == NULL) {
+    parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
+    parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
+    if (parser_token->bytes == NULL) {
         return -1;
     }
-
-    if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
-        Py_DECREF(token->bytes);
+    if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
+        Py_DECREF(parser_token->bytes);
         return -1;
     }
 
-    token->level = p->tok->level;
-
-    const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
-    int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
-    int end_lineno = p->tok->lineno;
-
-    int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
-    int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
-
-    token->lineno = lineno;
-    token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
-    token->end_lineno = end_lineno;
-    token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;
+    parser_token->level = new_token->level;
+    parser_token->lineno = new_token->lineno;
+    parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
+                                                                    : new_token->col_offset;
+    parser_token->end_lineno = new_token->end_lineno;
+    parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
+                                                                        : new_token->end_col_offset;
 
     p->fill += 1;
 
@@ -202,26 +197,25 @@ _resize_tokens_array(Parser *p) {
 int
 _PyPegen_fill_token(Parser *p)
 {
-    const char *start;
-    const char *end;
-    int type = _PyTokenizer_Get(p->tok, &start, &end);
+    struct token new_token;
+    int type = _PyTokenizer_Get(p->tok, &new_token);
 
     // Record and skip '# type: ignore' comments
     while (type == TYPE_IGNORE) {
-        Py_ssize_t len = end - start;
+        Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
         char *tag = PyMem_Malloc(len + 1);
         if (tag == NULL) {
             PyErr_NoMemory();
             return -1;
         }
-        strncpy(tag, start, len);
+        strncpy(tag, new_token.start, len);
         tag[len] = '\0';
         // Ownership of tag passes to the growable array
         if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
             PyErr_NoMemory();
             return -1;
         }
-        type = _PyTokenizer_Get(p->tok, &start, &end);
+        type = _PyTokenizer_Get(p->tok, &new_token);
     }
 
     // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
@@ -244,7 +238,7 @@ _PyPegen_fill_token(Parser *p)
     }
 
     Token *t = p->tokens[p->fill];
-    return initialize_token(p, t, start, end, type);
+    return initialize_token(p, t, &new_token, type);
 }
 
 #if defined(Py_DEBUG)
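One detail worth noting in initialize_token above: the first-line column adjustment survives the refactor unchanged. Tokens on the parser's starting line are still shifted by p->starting_col_offset, presumably so that parsing that begins mid-line reports columns relative to the enclosing source. A hedged illustration, with made-up numbers (the expression is taken from the diff; the values are hypothetical):

    /* Suppose p->starting_lineno == 1 and p->starting_col_offset == 8.
     * A token the tokenizer reports at col_offset 3 on line 1 lands at
     * parser column 8 + 3 == 11; tokens on later lines keep their raw offsets. */
    int col = (p->tok->lineno == p->starting_lineno)
                  ? p->starting_col_offset + new_token.col_offset
                  : new_token.col_offset;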

Parser/pegen_errors.c (+2 −3)
@@ -164,11 +164,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
     Py_ssize_t current_err_line = current_token->lineno;
 
     int ret = 0;
+    struct token new_token;
 
     for (;;) {
-        const char *start;
-        const char *end;
-        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
+        switch (_PyTokenizer_Get(p->tok, &new_token)) {
             case ERRORTOKEN:
                 if (p->tok->level != 0) {
                     int error_lineno = p->tok->parenlinenostack[p->tok->level-1];