Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-99612: Fix PyUnicode_DecodeUTF8Stateful() for ASCII-only data #99613

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,53 @@

class CAPITest(unittest.TestCase):

@support.cpython_only
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think of moving these tests to test_codecs? For me, these APIs are more related to codecs than "Unicode". test_codecs already has multiple tests about the UTF-8 encoding.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. But wouldn't it be more convenient to have the tests for all PyUnicode_* functions in a single file?

@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_decodeutf8(self):
    """Test PyUnicode_DecodeUTF8()

    Exercises the C function through the ``_testcapi.unicode_decodeutf8``
    wrapper: valid input, invalid input with the default error handler,
    the 'replace' handler, and an unknown handler name.
    """
    from _testcapi import unicode_decodeutf8 as decodeutf8

    # Valid text must decode back to itself, both with the default error
    # handler and with an explicit 'strict'.
    samples = ('abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600')
    for text in samples:
        encoded = text.encode('utf-8')
        self.assertEqual(decodeutf8(encoded), text)
        self.assertEqual(decodeutf8(encoded, 'strict'), text)

    # Invalid or truncated input is rejected when no handler is given.
    for bad in (b'\x80', b'\xc0', b'\xff', b'a\xf0\x9f'):
        with self.assertRaises(UnicodeDecodeError):
            decodeutf8(bad)

    # With 'replace' the bad sequence becomes a single U+FFFD.
    self.assertEqual(decodeutf8(b'a\xf0\x9f', 'replace'), 'a\ufffd')
    self.assertEqual(decodeutf8(b'a\xf0\x9fb', 'replace'), 'a\ufffdb')

    # An unknown error handler name is only looked up once an error occurs.
    with self.assertRaises(LookupError):
        decodeutf8(b'a\x80', 'foo')
    # TODO: Test PyUnicode_DecodeUTF8() with NULL as data and
    # negative size.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dislike TODOs that stay forever. Please either address it or remove it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In #99594 you asked me to split that PR into smaller parts. Addressing these TODOs here would add more complexity to this PR, and that work is not directly related to the bug.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we remove them, some cases will never be covered by tests.


@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_decodeutf8stateful(self):
    """Test PyUnicode_DecodeUTF8Stateful()

    The ``_testcapi.unicode_decodeutf8stateful`` wrapper returns a
    ``(decoded, consumed)`` pair, so every check below also pins the
    reported number of consumed bytes.
    """
    from _testcapi import unicode_decodeutf8stateful as decodeutf8stateful

    # Complete, valid input: everything is decoded and consumed.  The
    # pure-ASCII sample 'abc' is part of this set.
    samples = ('abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600')
    for text in samples:
        encoded = text.encode('utf-8')
        expected = (text, len(encoded))
        self.assertEqual(decodeutf8stateful(encoded), expected)
        self.assertEqual(decodeutf8stateful(encoded, 'strict'), expected)

    # Bytes that can never start a valid sequence are errors.
    for bad in (b'\x80', b'\xc0', b'\xff'):
        with self.assertRaises(UnicodeDecodeError):
            decodeutf8stateful(bad)

    # A truncated trailing sequence is not an error: it is left unconsumed,
    # regardless of the error handler.
    truncated = b'a\xf0\x9f'
    self.assertEqual(decodeutf8stateful(truncated), ('a', 1))
    self.assertEqual(decodeutf8stateful(truncated, 'replace'), ('a', 1))

    # The same prefix followed by another byte is a real error.
    broken = b'a\xf0\x9fb'
    with self.assertRaises(UnicodeDecodeError):
        decodeutf8stateful(broken)
    self.assertEqual(decodeutf8stateful(broken, 'replace'), ('a\ufffdb', 4))

    # Unknown error handler name.
    with self.assertRaises(LookupError):
        decodeutf8stateful(b'a\x80', 'foo')
    # TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as data and
    # negative size.
    # TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as the address of
    # "consumed".

# Test PyUnicode_FromFormat()
def test_from_format(self):
import_helper.import_module('ctypes')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix :c:func:`PyUnicode_DecodeUTF8Stateful` for ASCII-only data:
``*consumed`` was not set.
37 changes: 37 additions & 0 deletions Modules/_testcapi/unicode.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#define PY_SSIZE_T_CLEAN
#include "parts.h"

static struct PyModuleDef *_testcapimodule = NULL; // set at initialization
Expand Down Expand Up @@ -223,6 +224,40 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
return Py_BuildValue("(Nn)", result, utf8_len);
}

/* Test PyUnicode_DecodeUTF8().
 *
 * Python signature: unicode_decodeutf8(data: bytes-like[, errors: str | None])
 * Forwards the buffer, its length and the optional error handler name
 * straight to PyUnicode_DecodeUTF8() and returns whatever it returns
 * (NULL with an exception set on failure).
 */
static PyObject *
unicode_decodeutf8(PyObject *self, PyObject *args)
{
    const char *buf;
    Py_ssize_t buflen;
    const char *handler = NULL;  /* stays NULL when the argument is omitted */

    if (!PyArg_ParseTuple(args, "y#|z", &buf, &buflen, &handler)) {
        return NULL;
    }

    return PyUnicode_DecodeUTF8(buf, buflen, handler);
}

/* Test PyUnicode_DecodeUTF8Stateful().
 *
 * Python signature:
 *     unicode_decodeutf8stateful(data: bytes-like[, errors: str | None])
 * Returns a (decoded, consumed) tuple, or NULL with an exception set when
 * argument parsing or decoding fails.
 */
static PyObject *
unicode_decodeutf8stateful(PyObject *self, PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    /* Start from a marker value instead of leaving the variable
       uninitialized.  If the function under test forgets to store
       *consumed, an uninitialized stack slot could happen to hold the
       expected value (for example 0) and the test would miss the bug by
       luck; the marker makes such a failure show up deterministically. */
    Py_ssize_t consumed = 123456789;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) {
        return NULL;
    }

    result = PyUnicode_DecodeUTF8Stateful(data, size, errors, &consumed);
    if (!result) {
        return NULL;
    }
    /* "N" hands our reference to result over to the new tuple. */
    return Py_BuildValue("(Nn)", result, consumed);
}

static PyObject *
unicode_count(PyObject *self, PyObject *args)
{
Expand Down Expand Up @@ -716,6 +751,8 @@ static PyMethodDef TestMethods[] = {
{"unicode_asucs4", unicode_asucs4, METH_VARARGS},
{"unicode_asutf8", unicode_asutf8, METH_VARARGS},
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS},
{"unicode_decodeutf8", unicode_decodeutf8, METH_VARARGS},
{"unicode_decodeutf8stateful",unicode_decodeutf8stateful, METH_VARARGS},
{"unicode_count", unicode_count, METH_VARARGS},
{"unicode_findchar", unicode_findchar, METH_VARARGS},
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS},
Expand Down
3 changes: 3 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -4530,6 +4530,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
}
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (s == end) {
if (consumed) {
*consumed = size;
}
return u;
}

Expand Down