Skip to content

bpo-36299: array('u') uses Py_UCS4 instead of Py_UNICODE #12497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions Doc/library/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ defined:
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'B'`` | unsigned char | int | 1 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'u'`` | Py_UNICODE | Unicode character | 2 | \(1) |
| ``'u'`` | Py_UCS4 | Unicode character | 4 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'h'`` | signed short | int | 2 | |
+-----------+--------------------+-------------------+-----------------------+-------+
Expand All @@ -36,9 +36,9 @@ defined:
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'L'`` | unsigned long | int | 4 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'q'`` | signed long long | int | 8 | \(2) |
| ``'q'`` | signed long long | int | 8 | \(1) |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'Q'`` | unsigned long long | int | 8 | \(2) |
| ``'Q'`` | unsigned long long | int | 8 | \(1) |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'f'`` | float | float | 4 | |
+-----------+--------------------+-------------------+-----------------------+-------+
Expand All @@ -48,16 +48,6 @@ defined:
Notes:

(1)
The ``'u'`` type code corresponds to Python's obsolete unicode character
(:c:type:`Py_UNICODE` which is :c:type:`wchar_t`). Depending on the
platform, it can be 16 bits or 32 bits.

``'u'`` will be removed together with the rest of the :c:type:`Py_UNICODE`
API.

.. deprecated-removed:: 3.3 4.0

(2)
The ``'q'`` and ``'Q'`` type codes are available only if
the platform C compiler used to build Python supports C :c:type:`long long`,
or, on Windows, :c:type:`__int64`.
Expand Down
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,13 @@ Improved Modules
release.


array
-----

``array('u')`` now uses ``Py_UCS4`` instead of the deprecated ``Py_UNICODE``,
and the ``'u'`` type code is no longer deprecated. (Contributed by Inada Naoki in :issue:`36299`.)


asyncio
-------

Expand Down
8 changes: 2 additions & 6 deletions Lib/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,12 +1123,8 @@ def test_unicode(self):

def test_issue17223(self):
# this used to crash
if sizeof_wchar == 4:
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
else:
# PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
self.skipTest("specific to 32-bit wchar_t")
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
a = array.array('u', invalid_str)
self.assertRaises(ValueError, a.tounicode)
self.assertRaises(ValueError, str, a)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
``array('u')`` now uses ``Py_UCS4`` instead of the deprecated ``Py_UNICODE``.
Patch by Inada Naoki.
96 changes: 47 additions & 49 deletions Modules/arraymodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#endif /* HAVE_SYS_TYPES_H */
#endif /* !STDC_HEADERS */

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/*[clinic input]
module array
[clinic start generated code]*/
Expand Down Expand Up @@ -237,24 +240,26 @@ BB_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
static PyObject *
u_getitem(arrayobject *ap, Py_ssize_t i)
{
return PyUnicode_FromOrdinal(((Py_UNICODE *) ap->ob_item)[i]);
return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]);
}

static int
u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
{
Py_UNICODE *p;
Py_ssize_t len;

if (!PyArg_Parse(v, "u#;array item must be unicode character", &p, &len))
if (!PyUnicode_Check(v)) {
PyErr_SetString(PyExc_TypeError,
"array item must be unicode character");
return -1;
if (len != 1) {
}
if (PyUnicode_GetLength(v) != 1) {
PyErr_SetString(PyExc_TypeError,
"array item must be unicode character");
return -1;
}
if (i >= 0)
((Py_UNICODE *)ap->ob_item)[i] = p[0];

if (i >= 0) {
((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_ReadChar(v, 0);
}
return 0;
}

Expand Down Expand Up @@ -532,7 +537,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)

DEFINE_COMPAREITEMS(b, signed char)
DEFINE_COMPAREITEMS(BB, unsigned char)
DEFINE_COMPAREITEMS(u, Py_UNICODE)
DEFINE_COMPAREITEMS(u, Py_UCS4)
DEFINE_COMPAREITEMS(h, short)
DEFINE_COMPAREITEMS(HH, unsigned short)
DEFINE_COMPAREITEMS(i, int)
Expand All @@ -550,7 +555,7 @@ DEFINE_COMPAREITEMS(QQ, unsigned long long)
static const struct arraydescr descriptors[] = {
{'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1},
{'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0},
{'u', sizeof(Py_UNICODE), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
{'u', sizeof(Py_UCS4), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
{'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1},
{'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0},
{'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1},
Expand Down Expand Up @@ -1701,7 +1706,7 @@ array_array_tostring_impl(arrayobject *self)
/*[clinic input]
array.array.fromunicode

ustr: Py_UNICODE(zeroes=True)
ustr: unicode
/

Extends this array with data from the unicode string ustr.
Expand All @@ -1712,25 +1717,25 @@ some other type.
[clinic start generated code]*/

static PyObject *
array_array_fromunicode_impl(arrayobject *self, const Py_UNICODE *ustr,
Py_ssize_clean_t ustr_length)
/*[clinic end generated code: output=cf2f662908e2befc input=150f00566ffbca6e]*/
array_array_fromunicode_impl(arrayobject *self, PyObject *ustr)
/*[clinic end generated code: output=24359f5e001a7f2b input=025db1fdade7a4ce]*/
{
char typecode;

typecode = self->ob_descr->typecode;
if (typecode != 'u') {
if (self->ob_descr->typecode != 'u') {
PyErr_SetString(PyExc_ValueError,
"fromunicode() may only be called on "
"unicode type arrays");
return NULL;
}

Py_ssize_t ustr_length = PyUnicode_GetLength(ustr);
if (ustr_length > 0) {
Py_ssize_t old_size = Py_SIZE(self);
if (array_resize(self, old_size + ustr_length) == -1)
return NULL;
memcpy(self->ob_item + old_size * sizeof(Py_UNICODE),
ustr, ustr_length * sizeof(Py_UNICODE));
if (PyUnicode_AsUCS4(ustr, ((Py_UCS4*)self->ob_item) + old_size,
ustr_length, 0) == NULL) {
return NULL;
}
}

Py_RETURN_NONE;
Expand All @@ -1750,14 +1755,21 @@ static PyObject *
array_array_tounicode_impl(arrayobject *self)
/*[clinic end generated code: output=08e442378336e1ef input=127242eebe70b66d]*/
{
char typecode;
typecode = self->ob_descr->typecode;
if (typecode != 'u') {
if (self->ob_descr->typecode != 'u') {
PyErr_SetString(PyExc_ValueError,
"tounicode() may only be called on unicode type arrays");
return NULL;
}
return PyUnicode_FromWideChar((Py_UNICODE *) self->ob_item, Py_SIZE(self));
Py_UCS4 *item = (Py_UCS4*)self->ob_item;
for (Py_ssize_t i = 0; i < Py_SIZE(self); i++) {
if (item[i] > MAX_UNICODE) {
PyErr_SetString(PyExc_ValueError,
"code point not in range(0x110000)");
return NULL;
}
}
return PyUnicode_FromKindAndData(
PyUnicode_4BYTE_KIND, self->ob_item, Py_SIZE(self));
}

/*[clinic input]
Expand Down Expand Up @@ -1828,13 +1840,7 @@ typecode_to_mformat_code(char typecode)
return UNSIGNED_INT8;

case 'u':
if (sizeof(Py_UNICODE) == 2) {
return UTF16_LE + is_big_endian;
}
if (sizeof(Py_UNICODE) == 4) {
return UTF32_LE + is_big_endian;
}
return UNKNOWN_FORMAT;
return UTF32_LE + is_big_endian;

case 'f':
if (sizeof(float) == 4) {
Expand Down Expand Up @@ -2585,11 +2591,9 @@ array_buffer_getbuf(arrayobject *self, Py_buffer *view, int flags)
view->internal = NULL;
if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
view->format = (char *)self->ob_descr->formats;
#ifdef Py_UNICODE_WIDE
if (self->ob_descr->typecode == 'u') {
view->format = "w";
view->format = "I";
}
#endif
}

self->ob_exports++;
Expand Down Expand Up @@ -2711,30 +2715,24 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
Py_DECREF(v);
}
else if (initial != NULL && PyUnicode_Check(initial)) {
Py_UNICODE *ustr;
Py_ssize_t n;

ustr = PyUnicode_AsUnicode(initial);
if (ustr == NULL) {
PyErr_NoMemory();
Py_DECREF(a);
return NULL;
}

n = PyUnicode_GET_DATA_SIZE(initial);
Py_ssize_t n = PyUnicode_GetLength(initial);
if (n > 0) {
arrayobject *self = (arrayobject *)a;
char *item = self->ob_item;
item = (char *)PyMem_Realloc(item, n);
item = (char *)PyMem_Realloc(item, n * sizeof(Py_UCS4));
if (item == NULL) {
PyErr_NoMemory();
Py_DECREF(a);
return NULL;
}
self->ob_item = item;
Py_SIZE(self) = n / sizeof(Py_UNICODE);
memcpy(item, ustr, n);
self->allocated = Py_SIZE(self);
self->allocated = n;

if (PyUnicode_AsUCS4(initial, (Py_UCS4*)item, n, 0) == NULL) {
Py_DECREF(a);
return NULL;
}
Py_SIZE(self) = n;
}
}
else if (initial != NULL && array_Check(initial) && len > 0) {
Expand Down
17 changes: 10 additions & 7 deletions Modules/clinic/arraymodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.