From 74ea6b5a7501fb393cd567fb21998d0bfeeb267c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 12 May 2020 12:42:04 +0300 Subject: [PATCH] bpo-40593: Improve syntax errors for invalid characters in source code. (GH-20033) --- Include/cpython/unicodeobject.h | 2 + Include/errcode.h | 1 - Lib/test/test_fstring.py | 2 +- Lib/test/test_source_encoding.py | 3 + Lib/test/test_unicode_identifiers.py | 8 ++- .../2020-05-11-13-50-52.bpo-40593.yuOXj3.rst | 1 + Objects/unicodeobject.c | 64 ++++++++++++------- Parser/pegen/pegen.c | 3 - Parser/tokenizer.c | 46 ++++++++++--- Python/pythonrun.c | 3 - 10 files changed, 90 insertions(+), 43 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 81a35cdc801d09..94326876292b63 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -1222,6 +1222,8 @@ PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); and where the hash values are equal (i.e. a very probable match) */ PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); +PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *); + #ifdef __cplusplus } #endif diff --git a/Include/errcode.h b/Include/errcode.h index b37cd261d5ec4d..790518b8b7730e 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -29,7 +29,6 @@ extern "C" { #define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_IDENTIFIER 26 /* Invalid characters in identifier */ #define E_BADSINGLE 27 /* Ill-formed single statement input */ #ifdef __cplusplus diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index ac5aa9a76efe7c..e0bb5b56b2614f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -583,7 +583,7 @@ def test_missing_expression(self): ]) # Different error message is raised for other whitespace characters. - self.assertAllRaise(SyntaxError, 'invalid character in identifier', + self.assertAllRaise(SyntaxError, r"invalid non-printable character U\+00A0", ["f'''{\xa0}'''", "\xa0", ]) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0bd741c36ac29..5ca43461d9940d 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -57,6 +57,9 @@ def test_issue7820(self): # one byte in common with the UTF-16-LE BOM self.assertRaises(SyntaxError, eval, b'\xff\x20') + # one byte in common with the UTF-8 BOM + self.assertRaises(SyntaxError, eval, b'\xef\x20') + # two bytes in common with the UTF-8 BOM self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20') diff --git a/Lib/test/test_unicode_identifiers.py b/Lib/test/test_unicode_identifiers.py index 07332c4631903e..5b9ced5d1cb837 100644 --- a/Lib/test/test_unicode_identifiers.py +++ b/Lib/test/test_unicode_identifiers.py @@ -20,9 +20,11 @@ def test_non_bmp_normalized(self): def test_invalid(self): try: from test import badsyntax_3131 - except SyntaxError as s: - self.assertEqual(str(s), - "invalid character in identifier (badsyntax_3131.py, line 2)") + except SyntaxError as err: + self.assertEqual(str(err), + "invalid character '€' (U+20AC) (badsyntax_3131.py, line 2)") + self.assertEqual(err.lineno, 2) + self.assertEqual(err.offset, 1) else: self.fail("expected exception didn't occur") diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst new file mode 100644 index 00000000000000..5587d4f49ccf97 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-11-13-50-52.bpo-40593.yuOXj3.rst @@ -0,0 +1 @@ +Improved syntax errors for invalid characters in source code. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b9458721de18..276547ca48a5b2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -12309,31 +12309,22 @@ unicode_isnumeric_impl(PyObject *self) Py_RETURN_TRUE; } -int -PyUnicode_IsIdentifier(PyObject *self) +Py_ssize_t +_PyUnicode_ScanIdentifier(PyObject *self) { Py_ssize_t i; - int ready = PyUnicode_IS_READY(self); + if (PyUnicode_READY(self) == -1) + return -1; - Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); if (len == 0) { /* an empty string is not a valid identifier */ return 0; } - int kind = 0; - const void *data = NULL; - const wchar_t *wstr = NULL; - Py_UCS4 ch; - if (ready) { - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - ch = PyUnicode_READ(kind, data, 0); - } - else { - wstr = _PyUnicode_WSTR(self); - ch = wstr[0]; - } + int kind = PyUnicode_KIND(self); + const void *data = PyUnicode_DATA(self); + Py_UCS4 ch = PyUnicode_READ(kind, data, 0); /* PEP 3131 says that the first character must be in XID_Start and subsequent characters in XID_Continue, and for the ASCII range, the 2.x rules apply (i.e @@ -12347,17 +12338,44 @@ PyUnicode_IsIdentifier(PyObject *self) } for (i = 1; i < len; i++) { - if (ready) { - ch = PyUnicode_READ(kind, data, i); + ch = PyUnicode_READ(kind, data, i); + if (!_PyUnicode_IsXidContinue(ch)) { + return i; } - else { - ch = wstr[i]; + } + return i; +} + +int +PyUnicode_IsIdentifier(PyObject *self) +{ + if (PyUnicode_IS_READY(self)) { + Py_ssize_t i = _PyUnicode_ScanIdentifier(self); + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + /* an empty string is not a valid identifier */ + return len && i == len; + } + else { + Py_ssize_t i, len = PyUnicode_GET_SIZE(self); + if (len == 0) { + /* an empty string is not a valid identifier */ + return 0; } - if (!_PyUnicode_IsXidContinue(ch)) { + + const wchar_t *wstr = _PyUnicode_WSTR(self); + Py_UCS4 ch = wstr[0]; + if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) { return 0; } + + for (i = 1; i < len; i++) { + ch = wstr[i]; + if (!_PyUnicode_IsXidContinue(ch)) { + return 0; + } + } + return 1; } - return 1; } /*[clinic input] diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c index c80f08668b07d6..5f8c862c1f88be 100644 --- a/Parser/pegen/pegen.c +++ b/Parser/pegen/pegen.c @@ -337,9 +337,6 @@ tokenizer_error(Parser *p) case E_TOKEN: msg = "invalid token"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_EOFS: RAISE_SYNTAX_ERROR("EOF while scanning triple-quoted string literal"); return -1; diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 0f2b6af5e50adf..b81fa118f216eb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1101,25 +1101,53 @@ static int verify_identifier(struct tok_state *tok) { PyObject *s; - int result; if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); if (s == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { + tok->done = E_DECODE; + } + else { tok->done = E_ERROR; } return 0; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) { - tok->done = E_IDENTIFIER; + Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); + if (invalid < 0) { + Py_DECREF(s); + tok->done = E_ERROR; + return 0; } - return result; + assert(PyUnicode_GET_LENGTH(s) > 0); + if (invalid < PyUnicode_GET_LENGTH(s)) { + Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + /* Determine the offset in UTF-8 encoded input */ + Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); + if (s != NULL) { + Py_SETREF(s, PyUnicode_AsUTF8String(s)); + } + if (s == NULL) { + tok->done = E_ERROR; + return 0; + } + tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s); + } + Py_DECREF(s); + // PyUnicode_FromFormatV() does not support %X + char hex[9]; + snprintf(hex, sizeof(hex), "%04X", ch); + if (Py_UNICODE_ISPRINTABLE(ch)) { + syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); + } + else { + syntaxerror(tok, "invalid non-printable character U+%s", hex); + } + return 0; + } + Py_DECREF(s); + return 1; } static int diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 1b79a33c814da1..45f08b707eb999 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1603,9 +1603,6 @@ err_input(perrdetail *err) msg = "unexpected character after line continuation character"; break; - case E_IDENTIFIER: - msg = "invalid character in identifier"; - break; case E_BADSINGLE: msg = "multiple statements found while compiling a single statement"; break;