-
-
Notifications
You must be signed in to change notification settings - Fork 31.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
gh-99612: Fix PyUnicode_DecodeUTF8Stateful() for ASCII-only data #99613
Changes from 1 commit
8ff13de
b99be83
3d23080
1c07d9f
fd8c21d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,53 @@ | |
|
||
class CAPITest(unittest.TestCase): | ||
|
||
@support.cpython_only | ||
@unittest.skipIf(_testcapi is None, 'need _testcapi module') | ||
def test_decodeutf8(self): | ||
"""Test PyUnicode_DecodeUTF8()""" | ||
from _testcapi import unicode_decodeutf8 as decodeutf8 | ||
|
||
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||
b = s.encode('utf-8') | ||
self.assertEqual(decodeutf8(b), s) | ||
self.assertEqual(decodeutf8(b, 'strict'), s) | ||
|
||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\x80') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xc0') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xff') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'a\xf0\x9f') | ||
self.assertEqual(decodeutf8(b'a\xf0\x9f', 'replace'), 'a\ufffd') | ||
self.assertEqual(decodeutf8(b'a\xf0\x9fb', 'replace'), 'a\ufffdb') | ||
|
||
self.assertRaises(LookupError, decodeutf8, b'a\x80', 'foo') | ||
# TODO: Test PyUnicode_DecodeUTF8() with NULL as data and | ||
# negative size. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I dislike TODOs that stay forever. Please either address them or remove them. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In #99594 you requested that the PR be split into smaller parts. If I address these TODOs, it will add more complexity to this PR that is not directly related to the bug. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I remove them, some cases will never be covered by tests. |
||
|
||
@support.cpython_only | ||
@unittest.skipIf(_testcapi is None, 'need _testcapi module') | ||
def test_decodeutf8stateful(self): | ||
"""Test PyUnicode_DecodeUTF8Stateful()""" | ||
from _testcapi import unicode_decodeutf8stateful as decodeutf8stateful | ||
|
||
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||
b = s.encode('utf-8') | ||
self.assertEqual(decodeutf8stateful(b), (s, len(b))) | ||
self.assertEqual(decodeutf8stateful(b, 'strict'), (s, len(b))) | ||
|
||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\x80') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xc0') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xff') | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9f'), ('a', 1)) | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9f', 'replace'), ('a', 1)) | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'a\xf0\x9fb') | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9fb', 'replace'), ('a\ufffdb', 4)) | ||
|
||
self.assertRaises(LookupError, decodeutf8stateful, b'a\x80', 'foo') | ||
# TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as data and | ||
# negative size. | ||
# TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as the address of | ||
# "consumed". | ||
|
||
# Test PyUnicode_FromFormat() | ||
def test_from_format(self): | ||
import_helper.import_module('ctypes') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Fix :c:func:`PyUnicode_DecodeUTF8Stateful` for ASCII-only data: | ||
``*consumed`` was not set. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
#define PY_SSIZE_T_CLEAN | ||
#include "parts.h" | ||
|
||
static struct PyModuleDef *_testcapimodule = NULL; // set at initialization | ||
|
@@ -223,6 +224,40 @@ unicode_asutf8andsize(PyObject *self, PyObject *args) | |
return Py_BuildValue("(Nn)", result, utf8_len); | ||
} | ||
|
||
/* Test PyUnicode_DecodeUTF8() */ | ||
static PyObject * | ||
unicode_decodeutf8(PyObject *self, PyObject *args) | ||
{ | ||
const char *data; | ||
Py_ssize_t size; | ||
const char *errors = NULL; | ||
|
||
if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||
return NULL; | ||
|
||
return PyUnicode_DecodeUTF8(data, size, errors); | ||
} | ||
|
||
/* Test PyUnicode_DecodeUTF8Stateful() */ | ||
static PyObject * | ||
unicode_decodeutf8stateful(PyObject *self, PyObject *args) | ||
{ | ||
const char *data; | ||
Py_ssize_t size; | ||
const char *errors = NULL; | ||
Py_ssize_t consumed; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would feel safer if you initialized the value, maybe to a marker value like 42. Otherwise, the test may miss the bug by luck if local variables allocated on the stack happen to be initialized to 0. |
||
PyObject *result; | ||
|
||
if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||
return NULL; | ||
|
||
result = PyUnicode_DecodeUTF8Stateful(data, size, errors, &consumed); | ||
if (!result) { | ||
return NULL; | ||
} | ||
return Py_BuildValue("(Nn)", result, consumed); | ||
} | ||
|
||
static PyObject * | ||
unicode_count(PyObject *self, PyObject *args) | ||
{ | ||
|
@@ -716,6 +751,8 @@ static PyMethodDef TestMethods[] = { | |
{"unicode_asucs4", unicode_asucs4, METH_VARARGS}, | ||
{"unicode_asutf8", unicode_asutf8, METH_VARARGS}, | ||
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS}, | ||
{"unicode_decodeutf8", unicode_decodeutf8, METH_VARARGS}, | ||
{"unicode_decodeutf8stateful",unicode_decodeutf8stateful, METH_VARARGS}, | ||
{"unicode_count", unicode_count, METH_VARARGS}, | ||
{"unicode_findchar", unicode_findchar, METH_VARARGS}, | ||
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think of moving these tests to test_codecs? For me, these APIs are more related to codecs than "Unicode". test_codecs already has multiple tests about the UTF-8 encoding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. But wouldn't it be more convenient to have tests for all
PyUnicode_*
functions in a single file?