diff --git a/Parser/pegen.c b/Parser/pegen.c index a5d123da51296c..b55f2744bef78c 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -123,16 +123,18 @@ growable_comment_array_deallocate(growable_comment_array *arr) { } static int -_get_keyword_or_name_type(Parser *p, const char *name, int name_len) +_get_keyword_or_name_type(Parser *p, struct token new_token) { + int name_len = new_token.end_col_offset - new_token.col_offset; assert(name_len > 0); + if (name_len >= p->n_keyword_lists || p->keywords[name_len] == NULL || p->keywords[name_len]->type == -1) { return NAME; } for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) { - if (strncmp(k->str, name, name_len) == 0) { + if (strncmp(k->str, new_token.start, name_len) == 0) { return k->type; } } @@ -140,33 +142,26 @@ _get_keyword_or_name_type(Parser *p, const char *name, int name_len) } static int -initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) { - assert(token != NULL); +initialize_token(Parser *p, Token *parser_token, struct token new_token, int token_type) { + assert(parser_token != NULL); - token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type; - token->bytes = PyBytes_FromStringAndSize(start, end - start); - if (token->bytes == NULL) { + parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type; + parser_token->bytes = PyBytes_FromStringAndSize(new_token.start, new_token.end - new_token.start); + if (parser_token->bytes == NULL) { return -1; } - - if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) { - Py_DECREF(token->bytes); + if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) { + Py_DECREF(parser_token->bytes); return -1; } - token->level = p->tok->level; - - const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start; - int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno; - int end_lineno = p->tok->lineno; - - int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1; - int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1; - - token->lineno = lineno; - token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset; - token->end_lineno = end_lineno; - token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset; + parser_token->level = new_token.level; + parser_token->lineno = new_token.lineno; + parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token.col_offset + : new_token.col_offset; + parser_token->end_lineno = new_token.end_lineno; + parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token.end_col_offset + : new_token.end_col_offset; p->fill += 1; @@ -202,26 +197,25 @@ _resize_tokens_array(Parser *p) { int _PyPegen_fill_token(Parser *p) { - const char *start; - const char *end; - int type = _PyTokenizer_Get(p->tok, &start, &end); + struct token new_token; + int type = _PyTokenizer_Get(p->tok, &new_token); // Record and skip '# type: ignore' comments while (type == TYPE_IGNORE) { - Py_ssize_t len = end - start; + Py_ssize_t len = new_token.end_col_offset - new_token.col_offset; char *tag = PyMem_Malloc(len + 1); if (tag == NULL) { PyErr_NoMemory(); return -1; } - strncpy(tag, start, len); + strncpy(tag, new_token.start, len); tag[len] = '\0'; // Ownership of tag passes to the growable array if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) { PyErr_NoMemory(); return -1; } - type = _PyTokenizer_Get(p->tok, &start, &end); + type = _PyTokenizer_Get(p->tok, &new_token); } // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing @@ -244,7 +238,7 @@ _PyPegen_fill_token(Parser *p) } Token *t = p->tokens[p->fill]; - return initialize_token(p, t, start, end, type); + return initialize_token(p, t, new_token, type); } #if defined(Py_DEBUG) diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 95bbd43dc32621..7738cbaf9ef39e 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -164,11 +164,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) { Py_ssize_t current_err_line = current_token->lineno; int ret = 0; + struct token new_token; for (;;) { - const char *start; - const char *end; - switch (_PyTokenizer_Get(p->tok, &start, &end)) { + switch (_PyTokenizer_Get(p->tok, &new_token)) { case ERRORTOKEN: if (p->tok->level != 0) { int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 3c37fd9c45a49e..67ce96903a73fc 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -36,6 +36,8 @@ /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 +#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) + /* Forward */ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); @@ -1174,8 +1176,6 @@ syntaxerror_known_range(struct tok_state *tok, return ret; } - - static int indenterror(struct tok_state *tok) { @@ -1391,12 +1391,37 @@ tok_continuation_line(struct tok_state *tok) { } static int -tok_get(struct tok_state *tok, const char **p_start, const char **p_end) +token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end) +{ + // Default token values + token->lineno = -1; + token->end_lineno = -1; + token->col_offset = -1; + token->end_col_offset = -1; + token->start = NULL; + token->end = NULL; + + token->level = tok->level; + if (start != NULL && end != NULL) { + const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start; + token->lineno = type == STRING ? tok->first_lineno : tok->lineno; + token->end_lineno = tok->lineno; + token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1; + token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1; + token->start = start; + token->end = end; + } + return type; +} + +static int +tok_get(struct tok_state *tok, struct token *token) { int c; int blankline, nonascii; - *p_start = *p_end = NULL; + const char *p_start = NULL; + const char *p_end = NULL; nextline: tok->start = NULL; blankline = 0; @@ -1426,7 +1451,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1461,7 +1486,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } else if (col > tok->indstack[tok->indent]) { @@ -1469,10 +1494,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1488,10 +1513,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + return MAKE_TOKEN(indenterror(tok)); } } } @@ -1503,11 +1528,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (tok->pendin != 0) { if (tok->pendin < 0) { tok->pendin++; - return DEDENT; + return MAKE_TOKEN(DEDENT); } else { tok->pendin--; - return INDENT; + return MAKE_TOKEN(INDENT); } } @@ -1587,34 +1612,34 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); if (is_type_ignore) { - *p_start = ignore_end; - *p_end = tok->cur; + p_start = ignore_end; + p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ if (blankline) { tok_nextc(tok); tok->atbol = 1; } - return TYPE_IGNORE; + return MAKE_TOKEN(TYPE_IGNORE); } else { - *p_start = type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + p_start = type_start; + p_end = tok->cur; + return MAKE_TOKEN(TYPE_COMMENT); } } } } if (tok->done == E_INTERACT_STOP) { - return ENDMARKER; + return MAKE_TOKEN(ENDMARKER); } /* Check for EOF and errors now */ if (c == EOF) { if (tok->level) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN; + return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); } /* Identifier (most frequent token!) */ @@ -1654,11 +1679,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } tok_backup(tok, c); if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } - *p_start = tok->start; - *p_end = tok->cur; + p_start = tok->start; + p_end = tok->cur; /* async/await parsing block. */ if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { @@ -1673,10 +1698,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (!tok->async_hacks || tok->async_def) { /* Always recognize the keywords. */ if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; + return MAKE_TOKEN(ASYNC); } if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; + return MAKE_TOKEN(AWAIT); } } else if (memcmp(tok->start, "async", 5) == 0) { @@ -1684,13 +1709,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) Look ahead one token to see if that is 'def'. */ struct tok_state ahead_tok; - const char *ahead_tok_start = NULL; - const char *ahead_tok_end = NULL; + struct token ahead_token; int ahead_tok_kind; memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); + ahead_tok_kind = tok_get(&ahead_tok, &ahead_token); if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 @@ -1700,12 +1723,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) returning a plain NAME token, return ASYNC. */ tok->async_def_indent = tok->indent; tok->async_def = 1; - return ASYNC; + return MAKE_TOKEN(ASYNC); } } } - return NAME; + return MAKE_TOKEN(NAME); } /* Newline */ @@ -1714,15 +1737,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (blankline || tok->level > 0) { goto nextline; } - *p_start = tok->start; - *p_end = tok->cur - 1; /* Leave '\n' out of the string */ + p_start = tok->start; + p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; if (tok->async_def) { /* We're somewhere inside an 'async def' function, and we've encountered a NEWLINE after its signature. */ tok->async_def_nl = 1; } - return NEWLINE; + return MAKE_TOKEN(NEWLINE); } /* Period or number starting with period? */ @@ -1733,9 +1756,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } else if (c == '.') { c = tok_nextc(tok); if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(ELLIPSIS); } else { tok_backup(tok, c); @@ -1745,9 +1768,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else { tok_backup(tok, c); } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(DOT); } /* Number */ @@ -1764,14 +1787,14 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (!isxdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } do { c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); if (!verify_end_of_number(tok, c, "hexadecimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'o' || c == 'O') { @@ -1783,12 +1806,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c < '0' || c >= '8') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid octal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal")); } } do { @@ -1796,11 +1819,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while ('0' <= c && c < '8'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + return MAKE_TOKEN(syntaxerror(tok, + "invalid digit '%c' in octal literal", c)); } if (!verify_end_of_number(tok, c, "octal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (c == 'b' || c == 'B') { @@ -1812,12 +1835,11 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } if (c != '0' && c != '1') { if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } else { tok_backup(tok, c); - return syntaxerror(tok, "invalid binary literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal")); } } do { @@ -1825,11 +1847,10 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } while (c == '0' || c == '1'); } while (c == '_'); if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } if (!verify_end_of_number(tok, c, "binary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else { @@ -1841,7 +1862,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } if (c != '0') { @@ -1854,7 +1875,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) nonzero = 1; c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == '.') { @@ -1870,15 +1891,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) else if (nonzero) { /* Old-style octal: now disallowed. */ tok_backup(tok, c); - return syntaxerror_known_range( + return MAKE_TOKEN(syntaxerror_known_range( tok, (int)(tok->start + 1 - tok->line_start), (int)(zeros_end - tok->line_start), "leading zeros in decimal integer " "literals are not permitted; " - "use an 0o prefix for octal integers"); + "use an 0o prefix for octal integers")); } if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1886,7 +1907,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Decimal */ c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ @@ -1897,7 +1918,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (isdigit(c)) { c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } @@ -1911,21 +1932,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) c = tok_nextc(tok); if (!isdigit(c)) { tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } else if (!isdigit(c)) { tok_backup(tok, c); if (!verify_end_of_number(tok, e, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); } c = tok_decimal_tail(tok); if (c == 0) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == 'j' || c == 'J') { @@ -1933,18 +1954,18 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) imaginary: c = tok_nextc(tok); if (!verify_end_of_number(tok, c, "imaginary")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } else if (!verify_end_of_number(tok, c, "decimal")) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } } tok_backup(tok, c); - *p_start = tok->start; - *p_end = tok->cur; - return NUMBER; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(NUMBER); } letter_quote: @@ -1997,7 +2018,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (c != '\n') { tok->done = E_EOFS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } else { syntaxerror(tok, "unterminated string literal (detected at" @@ -2005,7 +2026,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (c != '\n') { tok->done = E_EOLS; } - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } } if (c == quote) { @@ -2019,15 +2040,15 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } } - *p_start = tok->start; - *p_end = tok->cur; - return STRING; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(STRING); } /* Line continuation */ if (c == '\\') { if ((c = tok_continuation_line(tok)) == -1) { - return ERRORTOKEN; + return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; goto again; /* Read next line */ @@ -2036,19 +2057,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Check for two-character token */ { int c2 = tok_nextc(tok); - int token = _PyToken_TwoChars(c, c2); - if (token != OP) { + int current_token = _PyToken_TwoChars(c, c2); + if (current_token != OP) { int c3 = tok_nextc(tok); - int token3 = _PyToken_ThreeChars(c, c2, c3); - if (token3 != OP) { - token = token3; + int current_token3 = _PyToken_ThreeChars(c, c2, c3); + if (current_token3 != OP) { + current_token = current_token3; } else { tok_backup(tok, c3); } - *p_start = tok->start; - *p_end = tok->cur; - return token; + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(current_token); } tok_backup(tok, c2); } @@ -2059,7 +2080,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) case '[': case '{': if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); + return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; @@ -2070,7 +2091,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) case ']': case '}': if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); + return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); } tok->level--; int opening = tok->parenstack[tok->level]; @@ -2079,16 +2100,16 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) (opening == '{' && c == '}'))) { if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, + return MAKE_TOKEN(syntaxerror(tok, "closing parenthesis '%c' does not match " "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); + c, opening, tok->parenlinenostack[tok->level])); } else { - return syntaxerror(tok, + return MAKE_TOKEN(syntaxerror(tok, "closing parenthesis '%c' does not match " "opening parenthesis '%c'", - c, opening); + c, opening)); } } break; @@ -2097,20 +2118,19 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) if (!Py_UNICODE_ISPRINTABLE(c)) { char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); - return syntaxerror(tok, "invalid non-printable character U+%s", hex); + return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); } /* Punctuation character */ - *p_start = tok->start; - *p_end = tok->cur; - return _PyToken_OneChar(c); + p_start = tok->start; + p_end = tok->cur; + return MAKE_TOKEN(_PyToken_OneChar(c)); } int -_PyTokenizer_Get(struct tok_state *tok, - const char **p_start, const char **p_end) +_PyTokenizer_Get(struct tok_state *tok, struct token *token) { - int result = tok_get(tok, p_start, p_end); + int result = tok_get(tok, token); if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -2166,8 +2186,6 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) { struct tok_state *tok; FILE *fp; - const char *p_start = NULL; - const char *p_end = NULL; char *encoding = NULL; fp = fdopen_borrow(fd); @@ -2191,8 +2209,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) return encoding; } } + struct token token; while (tok->lineno < 2 && tok->done == E_OK) { - _PyTokenizer_Get(tok, &p_start, &p_end); + _PyTokenizer_Get(tok, &token); } fclose(fp); if (tok->encoding) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 5ac64a99b7d661..5b8c7f314386ec 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -27,6 +27,12 @@ enum interactive_underflow_t { IUNDERFLOW_STOP, }; +struct token { + int level; + int lineno, col_offset, end_lineno, end_col_offset; + const char *start, *end; +}; + /* Tokenizer state */ struct tok_state { /* Input state; buf <= cur <= inp <= end */ @@ -94,7 +100,7 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); -extern int _PyTokenizer_Get(struct tok_state *, const char **, const char **); +extern int _PyTokenizer_Get(struct tok_state *, struct token *); #define tok_dump _Py_tok_dump diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index c5124a6942e7f2..8daa9877254e2e 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -60,9 +60,8 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source) static PyObject * tokenizeriter_next(tokenizeriterobject *it) { - const char *start; - const char *end; - int type = _PyTokenizer_Get(it->tok, &start, &end); + struct token token; + int type = _PyTokenizer_Get(it->tok, &token); if (type == ERRORTOKEN && PyErr_Occurred()) { return NULL; } @@ -71,11 +70,11 @@ tokenizeriter_next(tokenizeriterobject *it) return NULL; } PyObject *str = NULL; - if (start == NULL || end == NULL) { + if (token.start == NULL || token.end == NULL) { str = PyUnicode_FromString(""); } else { - str = PyUnicode_FromStringAndSize(start, end - start); + str = PyUnicode_FromStringAndSize(token.start, token.end - token.start); } if (str == NULL) { return NULL; @@ -92,11 +91,11 @@ tokenizeriter_next(tokenizeriterobject *it) int end_lineno = it->tok->lineno; int col_offset = -1; int end_col_offset = -1; - if (start != NULL && start >= line_start) { - col_offset = (int)(start - line_start); + if (token.start != NULL && token.start >= line_start) { + col_offset = (int)(token.start - line_start); } - if (end != NULL && end >= it->tok->line_start) { - end_col_offset = (int)(end - it->tok->line_start); + if (token.end != NULL && token.end >= it->tok->line_start) { + end_col_offset = (int)(token.end - it->tok->line_start); } return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);