Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-40593: Improve syntax errors for invalid characters in source code. #20033

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
2 changes: 2 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
and where the hash values are equal (i.e. a very probable match) */
PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);

PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);

#ifdef __cplusplus
}
#endif
1 change: 0 additions & 1 deletion Include/errcode.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#define E_BADSINGLE 27 /* Ill-formed single statement input */

#ifdef __cplusplus
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_fstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ def test_missing_expression(self):
])

# Different error message is raised for other whitespace characters.
self.assertAllRaise(SyntaxError, 'invalid character in identifier',
self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0",
["f'''{\xa0}'''",
"\xa0",
])
Expand Down
3 changes: 3 additions & 0 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def test_issue7820(self):
# one byte in common with the UTF-16-LE BOM
self.assertRaises(SyntaxError, eval, b'\xff\x20')

# one byte in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\x20')

# two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

Expand Down
8 changes: 5 additions & 3 deletions Lib/test/test_unicode_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self):
def test_invalid(self):
try:
from test import badsyntax_3131
except SyntaxError as s:
self.assertEqual(str(s),
"invalid character in identifier (badsyntax_3131.py, line 2)")
except SyntaxError as err:
self.assertEqual(str(err),
"invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)")
self.assertEqual(err.lineno, 2)
self.assertEqual(err.offset, 1)
else:
self.fail("expected exception didn't occur")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improved syntax errors for invalid characters in source code.
64 changes: 41 additions & 23 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self)
Py_RETURN_TRUE;
}

int
PyUnicode_IsIdentifier(PyObject *self)
Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject *self)
{
Py_ssize_t i;
int ready = PyUnicode_IS_READY(self);
if (PyUnicode_READY(self) == -1)
return -1;

Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}

int kind = 0;
const void *data = NULL;
const wchar_t *wstr = NULL;
Py_UCS4 ch;
if (ready) {
kind = PyUnicode_KIND(self);
data = PyUnicode_DATA(self);
ch = PyUnicode_READ(kind, data, 0);
}
else {
wstr = _PyUnicode_WSTR(self);
ch = wstr[0];
}
int kind = PyUnicode_KIND(self);
const void *data = PyUnicode_DATA(self);
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
/* PEP 3131 says that the first character must be in
XID_Start and subsequent characters in XID_Continue,
and for the ASCII range, the 2.x rules apply (i.e
Expand All @@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self)
}

for (i = 1; i < len; i++) {
if (ready) {
ch = PyUnicode_READ(kind, data, i);
ch = PyUnicode_READ(kind, data, i);
if (!_PyUnicode_IsXidContinue(ch)) {
return i;
}
else {
ch = wstr[i];
}
return i;
}

int
PyUnicode_IsIdentifier(PyObject *self)
{
if (PyUnicode_IS_READY(self)) {
Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
/* an empty string is not a valid identifier */
return len && i == len;
}
else {
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}
if (!_PyUnicode_IsXidContinue(ch)) {

const wchar_t *wstr = _PyUnicode_WSTR(self);
Py_UCS4 ch = wstr[0];
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}

for (i = 1; i < len; i++) {
ch = wstr[i];
if (!_PyUnicode_IsXidContinue(ch)) {
return 0;
}
}
return 1;
}
return 1;
}

/*[clinic input]
Expand Down
3 changes: 0 additions & 3 deletions Parser/pegen/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,6 @@ tokenizer_error(Parser *p)
case E_TOKEN:
msg = "invalid token";
break;
case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_EOFS:
RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal");
return -1;
Expand Down
46 changes: 37 additions & 9 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1101,25 +1101,53 @@ static int
verify_identifier(struct tok_state *tok)
{
PyObject *s;
int result;
if (tok->decoding_erred)
return 0;
s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
if (s == NULL) {
if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
PyErr_Clear();
tok->done = E_IDENTIFIER;
} else {
tok->done = E_DECODE;
}
else {
tok->done = E_ERROR;
}
return 0;
}
result = PyUnicode_IsIdentifier(s);
Py_DECREF(s);
if (result == 0) {
tok->done = E_IDENTIFIER;
Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
if (invalid < 0) {
Py_DECREF(s);
tok->done = E_ERROR;
return 0;
}
return result;
assert(PyUnicode_GET_LENGTH(s) > 0);
if (invalid < PyUnicode_GET_LENGTH(s)) {
Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
/* Determine the offset in UTF-8 encoded input */
Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
if (s != NULL) {
Py_SETREF(s, PyUnicode_AsUTF8String(s));
}
if (s == NULL) {
tok->done = E_ERROR;
return 0;
}
tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
}
Py_DECREF(s);
// PyUnicode_FromFormatV() does not support %X
char hex[9];
snprintf(hex, sizeof(hex), "%04X", ch);
if (Py_UNICODE_ISPRINTABLE(ch)) {
syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
}
else {
syntaxerror(tok, "invalid non-printable character U+%s", hex);
}
return 0;
}
Py_DECREF(s);
return 1;
}

static int
Expand Down
3 changes: 0 additions & 3 deletions Python/pythonrun.c
Original file line number Diff line number Diff line change
Expand Up @@ -1603,9 +1603,6 @@ err_input(perrdetail *err)
msg = "unexpected character after line continuation character";
break;

case E_IDENTIFIER:
msg = "invalid character in identifier";
break;
case E_BADSINGLE:
msg = "multiple statements found while compiling a single statement";
break;
Expand Down