From 5f05ce97f1677099babe5e767f5493268ed3f7aa Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 10:06:09 -0700 Subject: [PATCH 1/6] GH-97973: Return all necessary information from the tokenizer Right now, the tokenizer only returns type and two pointers to the start and end of the token. This PR modifies the tokenizer to return the type and set all of the necessary information, so that the parser does not have to this. --- Parser/pegen.c | 54 +++++----- Parser/pegen_errors.c | 5 +- Parser/tokenizer.c | 225 +++++++++++++++++++++------------------ Parser/tokenizer.h | 8 +- Python/Python-tokenize.c | 17 ++- 5 files changed, 163 insertions(+), 146 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index a5d123da51296c..b55f2744bef78c 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -123,16 +123,18 @@ growable_comment_array_deallocate(growable_comment_array *arr) { } static int -_get_keyword_or_name_type(Parser *p, const char *name, int name_len) +_get_keyword_or_name_type(Parser *p, struct token new_token) { + int name_len = new_token.end_col_offset - new_token.col_offset; assert(name_len > 0); + if (name_len >= p->n_keyword_lists || p->keywords[name_len] == NULL || p->keywords[name_len]->type == -1) { return NAME; } for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) { - if (strncmp(k->str, name, name_len) == 0) { + if (strncmp(k->str, new_token.start, name_len) == 0) { return k->type; } } @@ -140,33 +142,26 @@ _get_keyword_or_name_type(Parser *p, const char *name, int name_len) } static int -initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) { - assert(token != NULL); +initialize_token(Parser *p, Token *parser_token, struct token new_token, int token_type) { + assert(parser_token != NULL); - token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type; - token->bytes = PyBytes_FromStringAndSize(start, end - start); - if (token->bytes == NULL) { + parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type; + parser_token->bytes = PyBytes_FromStringAndSize(new_token.start, new_token.end - new_token.start); + if (parser_token->bytes == NULL) { return -1; } - - if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) { - Py_DECREF(token->bytes); + if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) { + Py_DECREF(parser_token->bytes); return -1; } - token->level = p->tok->level; - - const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start; - int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno; - int end_lineno = p->tok->lineno; - - int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1; - int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1; - - token->lineno = lineno; - token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset; - token->end_lineno = end_lineno; - token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset; + parser_token->level = new_token.level; + parser_token->lineno = new_token.lineno; + parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token.col_offset + : new_token.col_offset; + parser_token->end_lineno = new_token.end_lineno; + parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token.end_col_offset + : new_token.end_col_offset; p->fill += 1; @@ -202,26 +197,25 @@ _resize_tokens_array(Parser *p) { int _PyPegen_fill_token(Parser *p) { - const char *start; - const char *end; - int type = _PyTokenizer_Get(p->tok, &start, &end); + struct token new_token; + int type = _PyTokenizer_Get(p->tok, &new_token); // Record and skip '# type: ignore' comments while (type == TYPE_IGNORE) { - Py_ssize_t len = end - start; + Py_ssize_t len = new_token.end_col_offset - new_token.col_offset; char *tag = PyMem_Malloc(len + 1); if (tag == NULL) { PyErr_NoMemory(); return -1; } - strncpy(tag, start, len); + strncpy(tag, new_token.start, len); tag[len] = '\0'; // Ownership of tag passes to the growable array if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) { PyErr_NoMemory(); return -1; } - type = _PyTokenizer_Get(p->tok, &start, &end); + type = _PyTokenizer_Get(p->tok, &new_token); } // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing @@ -244,7 +238,7 @@ _PyPegen_fill_token(Parser *p) } Token *t = p->tokens[p->fill]; - return initialize_token(p, t, start, end, type); + return initialize_token(p, t, new_token, type); } #if defined(Py_DEBUG) diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 95bbd43dc32621..7738cbaf9ef39e 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -164,11 +164,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { Py_ssize_t current_err_line = current_token->lineno; int ret = 0; + struct token new_token; for (;;) { - const char *start; - const char *end; - switch (_PyTokenizer_Get(p->tok, &start, &end)) { + switch (_PyTokenizer_Get(p->tok, &new_token)) { case ERRORTOKEN: if (p->tok->level != 0) { int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 3c37fd9c45a49e..67ce96903a73fc 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -36,6 +36,8 @@ /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 +#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) + /* Forward */ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); @@ -1174,8 +1176,6 @@ syntaxerror_known_range(struct tok_state *tok, return ret; } - - static int indenterror(struct tok_state *tok) { @@ -1391,12 +1391,37 @@ tok_continuation_line(struct tok_state *tok) { } static int -tok_get(struct tok_state *tok, const char **p_start, const char **p_end) +token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) +{ + // Default token values + token->lineno = -1; + token->end_lineno = -1; + token->col_offset = -1; + token->end_col_offset = -1; + token->start = NULL; + token->end = NULL; + + token->level = tok->level; + if (start != NULL && end != NULL) { + const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start; + token->lineno = type == STRING ? tok->first_lineno : tok->lineno; + token->end_lineno = tok->lineno; + token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1; + token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1; + token->start = start; + token->end = end; + } + return type; +} + +static int +tok_get(struct tok_state *tok, struct token *token) { int c; int blankline, nonascii; - *p_start = *p_end = NULL; + const char *p_start = NULL; + const char *p_end = NULL; nextline: tok->start = NULL; blankline = 0; @@ -1426,7 +1451,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1461,7 +1486,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } else if (col > tok->indstack[tok->indent]) { @@ -1469,10 +1494,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1488,10 +1513,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } } @@ -1503,11 +1528,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (tok->pendin != 0) { if (tok->pendin < 0) { tok->pendin++; - return DEDENT; + return MAKE_TOKEN(DEDENT); } else { tok->pendin--; - return INDENT; + return MAKE_TOKEN(INDENT); } } @@ -1587,34 +1612,34 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); if (is_type_ignore) { - *p_start = ignore_end; - *p_end = tok->cur; + p_start = ignore_end; + p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ if (blankline) { tok_nextc(tok); tok->atbol = 1; } - return TYPE_IGNORE; + return MAKE_TOKEN(TYPE_IGNORE); } else { - *p_start = type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + p_start = type_start; + p_end = tok->cur; + return MAKE_TOKEN(TYPE_COMMENT); } } } } if (tok->done == E_INTERACT_STOP) { - return ENDMARKER; + return MAKE_TOKEN(ENDMARKER); } /* Check for EOF and errors now */ if (c == EOF) { if (tok->level) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); } /* Identifier (most frequent token!) */ @@ -1654,11 +1679,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - *p_start = tok->start; - *p_end = tok->cur; + p_start = tok->start; + p_end = tok->cur; /* async/await parsing block. */ if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { @@ -1673,10 +1698,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (!tok->async_hacks || tok->async_def) { /* Always recognize the keywords. */ if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; + return MAKE_TOKEN(ASYNC); } if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; + return MAKE_TOKEN(AWAIT); } } else if (memcmp(tok->start, "async", 5) == 0) { @@ -1684,13 +1709,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) Look ahead one token to see if that is 'def'. */ struct tok_state ahead_tok; - const char *ahead_tok_start = NULL; - const char *ahead_tok_end = NULL; + struct token ahead_token; int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); + ahead_tok_kind = tok_get(&ahead_tok, &ahead_token); if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 @@ -1700,12 +1723,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) returning a plain NAME token, return ASYNC. */ tok->async_def_indent = tok->indent; tok->async_def = 1; - return ASYNC; + return MAKE_TOKEN(ASYNC); } } } - return NAME; + return MAKE_TOKEN(NAME); } /* Newline */ @@ -1714,15 +1737,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (blankline || tok->level > 0) { goto nextline; } - *p_start = tok->start; - *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + p_start = tok->start; + p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; if (tok->async_def) { /* We're somewhere inside an 'async def' function, and we've encountered a NEWLINE after its signature. */ tok->async_def_nl = 1; } - return NEWLINE; + return MAKE_TOKEN(NEWLINE); } /* Period or number starting with period? */ @@ -1733,9 +1756,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } else if (c == '.') { c = tok_nextc(tok); if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(ELLIPSIS); } else { tok_backup(tok, c); @@ -1745,9 +1768,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else { tok_backup(tok, c); } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(DOT); } /* Number */ @@ -1764,14 +1787,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (!isxdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } do { c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); if (!verify_end_of_number(tok, c, "hexadecimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'o' || c == 'O') { @@ -1783,12 +1806,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c < '0' || c >= '8') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid octal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal")); } } do { @@ -1796,11 +1819,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while ('0' <= c && c < '8'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } if (!verify_end_of_number(tok, c, "octal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'b' || c == 'B') { @@ -1812,12 +1835,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c != '0' && c != '1') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid binary literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal")); } } do { @@ -1825,11 +1847,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while (c == '0' || c == '1'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } if (!verify_end_of_number(tok, c, "binary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1841,7 +1862,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } if (c != '0') { @@ -1854,7 +1875,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) nonzero = 1; c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == '.') { @@ -1870,15 +1891,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else if (nonzero) { /* Old-style octal: now disallowed. */ tok_backup(tok, c); - return syntaxerror_known_range( + return MAKE_TOKEN(syntaxerror_known_range( tok, (int)(tok->start + 1 - tok->line_start), (int)(zeros_end - tok->line_start), "leading zeros in decimal integer " "literals are not permitted; " - "use an 0o prefix for octal integers"); + "use an 0o prefix for octal integers")); } if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1886,7 +1907,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Decimal */ c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ @@ -1897,7 +1918,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (isdigit(c)) { c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1911,21 +1932,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } else if (!isdigit(c)) { tok_backup(tok, c); if (!verify_end_of_number(tok, e, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); } c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == 'j' || c == 'J') { @@ -1933,18 +1954,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) imaginary: c = tok_nextc(tok); if (!verify_end_of_number(tok, c, "imaginary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } tok_backup(tok, c); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); } letter_quote: @@ -1997,7 +2018,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (c != '\n') { tok->done = E_EOFS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } else { syntaxerror(tok, "unterminated string literal (detected at" @@ -2005,7 +2026,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (c != '\n') { tok->done = E_EOLS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == quote) { @@ -2019,15 +2040,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } } - *p_start = tok->start; - *p_end = tok->cur; - return STRING; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(STRING); } /* Line continuation */ if (c == '\\') { if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; goto again; /* Read next line */ @@ -2036,19 +2057,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Check for two-character token */ { int c2 = tok_nextc(tok); - int token = _PyToken_TwoChars(c, c2); - if (token != OP) { + int current_token = _PyToken_TwoChars(c, c2); + if (current_token != OP) { int c3 = tok_nextc(tok); - int token3 = _PyToken_ThreeChars(c, c2, c3); - if (token3 != OP) { - token = token3; + int current_token3 = _PyToken_ThreeChars(c, c2, c3); + if (current_token3 != OP) { + current_token = current_token3; } else { tok_backup(tok, c3); } - *p_start = tok->start; - *p_end = tok->cur; - return token; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(current_token); } tok_backup(tok, c2); } @@ -2059,7 +2080,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) case '[': case '{': if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); + return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; @@ -2070,7 +2091,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) case ']': case '}': if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); + return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); } tok->level--; int opening = tok->parenstack[tok->level]; @@ -2079,16 +2100,16 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) (opening == '{' && c == '}'))) { if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, + return MAKE_TOKEN(syntaxerror(tok, "closing parenthesis '%c' does not match " "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); + c, opening, tok->parenlinenostack[tok->level])); } else { - return syntaxerror(tok, + return MAKE_TOKEN(syntaxerror(tok, "closing parenthesis '%c' does not match " "opening parenthesis '%c'", - c, opening); + c, opening)); } } break; @@ -2097,20 +2118,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (!Py_UNICODE_ISPRINTABLE(c)) { char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); - return syntaxerror(tok, "invalid non-printable character U+%s", hex); + return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); } /* Punctuation character */ - *p_start = tok->start; - *p_end = tok->cur; - return _PyToken_OneChar(c); + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(_PyToken_OneChar(c)); } int -_PyTokenizer_Get(struct tok_state *tok, - const char **p_start, const char **p_end) +_PyTokenizer_Get(struct tok_state *tok, struct token *token) { - int result = tok_get(tok, p_start, p_end); + int result = tok_get(tok, token); if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -2166,8 +2186,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) { struct tok_state *tok; FILE *fp; - const char *p_start = NULL; - const char *p_end = NULL; char *encoding = NULL; fp = fdopen_borrow(fd); @@ -2191,8 +2209,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) return encoding; } } + struct token token; while (tok->lineno < 2 && tok->done == E_OK) { - _PyTokenizer_Get(tok, &p_start, &p_end); + _PyTokenizer_Get(tok, &token); } fclose(fp); if (tok->encoding) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5ac64a99b7d661..5b8c7f314386ec 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -27,6 +27,12 @@ enum interactive_underflow_t { IUNDERFLOW_STOP, }; +struct token { + int level; + int lineno, col_offset, end_lineno, end_col_offset; + const char *start, *end; +}; + /* Tokenizer state */ struct tok_state { /* Input state; buf <= cur <= inp <= end */ @@ -94,7 +100,7 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); -extern int _PyTokenizer_Get(struct tok_state *, const char **, const char **); +extern int _PyTokenizer_Get(struct tok_state *, struct token *); #define tok_dump _Py_tok_dump diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index c5124a6942e7f2..8daa9877254e2e 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -60,9 +60,8 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source) static PyObject * tokenizeriter_next(tokenizeriterobject *it) { - const char *start; - const char *end; - int type = _PyTokenizer_Get(it->tok, &start, &end); + struct token token; + int type = _PyTokenizer_Get(it->tok, &token); if (type == ERRORTOKEN && PyErr_Occurred()) { return NULL; } @@ -71,11 +70,11 @@ tokenizeriter_next(tokenizeriterobject *it) return NULL; } PyObject *str = NULL; - if (start == NULL || end == NULL) { + if (token.start == NULL || token.end == NULL) { str = PyUnicode_FromString(""); } else { - str = PyUnicode_FromStringAndSize(start, end - start); + str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); } if (str == NULL) { return NULL; @@ -92,11 +91,11 @@ tokenizeriter_next(tokenizeriterobject *it) int end_lineno = it->tok->lineno; int col_offset = -1; int end_col_offset = -1; - if (start != NULL && start >= line_start) { - col_offset = (int)(start - line_start); + if (token.start != NULL && token.start >= line_start) { + col_offset = (int)(token.start - line_start); } - if (end != NULL && end >= it->tok->line_start) { - end_col_offset = (int)(end - it->tok->line_start); + if (token.end != NULL && token.end >= it->tok->line_start) { + end_col_offset = (int)(token.end - it->tok->line_start); } return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line); From f64566466fe4cdf83b3c58498fc3a0a9d3629ecb Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 6 Oct 2022 17:13:36 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2022-10-06-17-13-33.gh-issue-97973.gB-xWi.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-10-06-17-13-33.gh-issue-97973.gB-xWi.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-10-06-17-13-33.gh-issue-97973.gB-xWi.rst b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-17-13-33.gh-issue-97973.gB-xWi.rst new file mode 100644 index 00000000000000..a0095a61ec32b7 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-10-06-17-13-33.gh-issue-97973.gB-xWi.rst @@ -0,0 +1 @@ +Modify the tokenizer to return all necessary information the parser needs to set location information in the AST nodes, so that the parser does not have to calculate those doing pointer arithmetic. From e892553f791e232e05d687ff9d73a94dc982b1d6 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 10:55:49 -0700 Subject: [PATCH 3/6] Fix tests by using the correct defaults for line number --- Parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 67ce96903a73fc..afa0a8c3d8f400 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1394,8 +1394,8 @@ static int token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) { // Default token values - token->lineno = -1; - token->end_lineno = -1; + token->lineno = type == STRING ? tok->first_lineno : tok->lineno; + token->end_lineno = tok->lineno;; token->col_offset = -1; token->end_col_offset = -1; token->start = NULL; From 0261165943dba467a501798c7011d1a8d1d39d31 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 6 Oct 2022 11:23:43 -0700 Subject: [PATCH 4/6] Fix token_setup --- Parser/tokenizer.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index afa0a8c3d8f400..98b265eaf140b3 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1393,23 +1393,18 @@ tok_continuation_line(struct tok_state *tok) { static int token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) { - // Default token values + token->level = tok->level; token->lineno = type == STRING ? tok->first_lineno : tok->lineno; - token->end_lineno = tok->lineno;; + token->end_lineno = tok->lineno; token->col_offset = -1; token->end_col_offset = -1; - token->start = NULL; - token->end = NULL; + token->start = start; + token->end = end; - token->level = tok->level; if (start != NULL && end != NULL) { const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start; - token->lineno = type == STRING ? tok->first_lineno : tok->lineno; - token->end_lineno = tok->lineno; token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1; token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1; - token->start = start; - token->end = end; } return type; } From 25c1004a83364b440a87779556e117a3828fa301 Mon Sep 17 00:00:00 2001 From: Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com> Date: Thu, 6 Oct 2022 22:48:19 +0530 Subject: [PATCH 5/6] GH-88050: fix race in closing subprocess pipe in asyncio (#97951) Check for None when iterating over `self._pipes.values()`. --- Lib/asyncio/base_subprocess.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/asyncio/base_subprocess.py b/Lib/asyncio/base_subprocess.py index c2ca4a2792f666..e15bb4141fc02a 100644 --- a/Lib/asyncio/base_subprocess.py +++ b/Lib/asyncio/base_subprocess.py @@ -216,7 +216,9 @@ def _process_exited(self, returncode): self._proc.returncode = returncode self._call(self._protocol.process_exited) for p in self._pipes.values(): - p.pipe.close() + if p is not None: + p.pipe.close() + self._try_finish() async def _wait(self): From b3b300e0387b1186397edc910e2ba6ae84227755 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 6 Oct 2022 13:39:17 -0400 Subject: [PATCH 6/6] gh-94808: Coverage: Test that maximum indentation level is handled (#95926) * gh-94808: Coverage: Test that maximum indentation level is handled * Use "compile" rather than "exec" --- Lib/test/test_tokenize.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 1272e1e9be002e..47f2c06685bcaa 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -3,7 +3,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE, _generate_tokens_from_c_tokenizer) + NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT) from io import BytesIO, StringIO import unittest from textwrap import dedent @@ -2512,6 +2512,26 @@ def get_tokens(string): self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000) self.assertRaises(SyntaxError, get_tokens, "]") + def test_max_indent(self): + MAXINDENT = 100 + + def generate_source(indents): + source = ''.join((' ' * x) + 'if True:\n' for x in range(indents)) + source += ' ' * indents + 'pass\n' + return source + + valid = generate_source(MAXINDENT - 1) + tokens = list(_generate_tokens_from_c_tokenizer(valid)) + self.assertEqual(tokens[-1].type, DEDENT) + compile(valid, "", "exec") + + invalid = generate_source(MAXINDENT) + tokens = list(_generate_tokens_from_c_tokenizer(invalid)) + self.assertEqual(tokens[-1].type, NEWLINE) + self.assertRaises( + IndentationError, compile, invalid, "", "exec" + ) + def test_continuation_lines_indentation(self): def get_tokens(string): return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]