Skip to content

Commit

Permalink
pythongh-119609: Add PyUnicode_Export() function
Browse files Browse the repository at this point in the history
Add PyUnicode_Export(), PyUnicode_GetBufferFormat() and
PyUnicode_Import() functions to the limited C API.
  • Loading branch information
vstinner committed Sep 5, 2024
1 parent 092abc4 commit f819661
Show file tree
Hide file tree
Showing 11 changed files with 586 additions and 4 deletions.
65 changes: 65 additions & 0 deletions Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,71 @@ APIs:
.. versionadded:: 3.3
.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view)
Export the contents of the *unicode* string in one of the formats requested
by *requested_formats*.
* On success, fill *view*, and return ``0``.
* On error, set an exception and return ``-1``.
The export must be released by :c:func:`PyBuffer_Release`.
The contents of the buffer are valid until they are released.
The buffer is read-only and must not be modified.
*unicode* and *view* must not be NULL.
Available formats:
.. c:namespace:: NULL
=================================== ======== ===========================
Constant Identifier Value Description
=================================== ======== ===========================
.. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``)
.. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``)
.. c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``)
.. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``)
.. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``)
=================================== ======== ===========================
*requested_formats* can be a single format or a bitwise combination of the
formats in the table above.
On success, the export uses exactly one of the requested formats; call
:c:func:`PyUnicode_GetBufferFormat` on *view* to find out which one.
Note that future versions of Python may introduce additional formats.
.. versionadded:: 3.14
.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format)
Get the format of the buffer *view*.
* On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` value
and return ``0``.
* On error, set an exception and return ``-1``.
*view* must be a buffer filled by :c:func:`PyUnicode_Export`.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format)
Create a string object from a buffer in an “export format”.
* Return a reference to a new string object on success.
* Set an exception and return ``NULL`` on error.
*data* must not be NULL. *nbytes* must be positive or zero.
See :c:func:`PyUnicode_Export` for the available formats.
.. versionadded:: 3.14
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
Py_ssize_t size)
Expand Down
3 changes: 3 additions & 0 deletions Doc/data/stable_abi.dat

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,10 @@ New Features

(Contributed by Victor Stinner in :gh:`107954`.)

* Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`,
and :c:func:`PyUnicode_Import` functions to export and import strings.
(Contributed by Victor Stinner in :gh:`119609`.)


Porting to Python 3.14
----------------------
Expand Down
18 changes: 18 additions & 0 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
const char *u /* UTF-8 encoded string */
);

/* Format flags used by PyUnicode_Export(), PyUnicode_GetBufferFormat() and
   PyUnicode_Import(). Each flag is a distinct bit, so several flags can be
   combined with a bitwise OR to request more than one format. */
#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string)
#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1*
#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2*
#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4*
#define PyUnicode_FORMAT_UTF8 0x10 // char*

/* Export the contents of 'unicode' in one of the formats listed in
   'requested_formats' and fill 'view'.
   Return 0 on success. Set an exception and return -1 on error.
   The export must be released with PyBuffer_Release(). */
PyAPI_FUNC(int) PyUnicode_Export(
PyObject *unicode,
uint32_t requested_formats,
Py_buffer *view);
/* Get the format of 'view', a buffer filled by PyUnicode_Export().
   On success, set '*format' to a PyUnicode_FORMAT_* value and return 0.
   On error, set an exception and return -1. */
PyAPI_FUNC(int) PyUnicode_GetBufferFormat(
const Py_buffer *view,
uint32_t *format);
/* Create a string object from 'nbytes' bytes of 'data' stored in the
   PyUnicode_FORMAT_* format 'format'.
   Return a new reference on success. Set an exception and return NULL
   on error. */
PyAPI_FUNC(PyObject*) PyUnicode_Import(
const void *data,
Py_ssize_t nbytes,
uint32_t format);

/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H
Expand Down
183 changes: 180 additions & 3 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
import struct
import sys
import unittest
from test import support
from test.support import import_helper

Expand Down Expand Up @@ -28,6 +29,14 @@ class Str(str):
pass


# Python-side copies of the PyUnicode_FORMAT_* macros declared in
# Include/unicodeobject.h; each one is a distinct bit flag.
PyUnicode_FORMAT_ASCII = 0x01
PyUnicode_FORMAT_UCS1 = 0x02
PyUnicode_FORMAT_UCS2 = 0x04
PyUnicode_FORMAT_UCS4 = 0x08
PyUnicode_FORMAT_UTF8 = 0x10
# Invalid native format: a bit that matches none of the formats above
PyUnicode_FORMAT_INVALID = 0x20

class CAPITest(unittest.TestCase):

@support.cpython_only
Expand Down Expand Up @@ -1721,6 +1730,142 @@ def test_pep393_utf8_caching_bug(self):
# Check that the second call returns the same result
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))

def test_unicode_export(self):
    # Test PyUnicode_Export() through _testlimitedcapi.unicode_export().
    # Every result is compared against a 4-tuple: the exported bytes, the
    # PyUnicode_FORMAT_* value, a size of 1, 2 or 4 (presumably the buffer
    # item size) and a one-letter struct-style type code.
    export = _testlimitedcapi.unicode_export
    little_endian = (sys.byteorder == 'little')
    utf16 = 'utf-16le' if little_endian else 'utf-16be'
    utf32 = 'utf-32le' if little_endian else 'utf-32be'

    # All fixed-width formats, UTF-8 excluded.
    native = (PyUnicode_FORMAT_ASCII | PyUnicode_FORMAT_UCS1
              | PyUnicode_FORMAT_UCS2 | PyUnicode_FORMAT_UCS4)

    ucs1_code = 'B'
    ucs2_code = 'H'
    # Pick the struct code of a 4-byte unsigned integer: 'I' if it is
    # 4 bytes wide, otherwise 'L'.
    for candidate in ('I', 'L'):
        if struct.calcsize(candidate) == 4:
            ucs4_code = candidate
            break
    else:
        self.fail("unable to get BUFFER_UCS4 ")

    def expect_ucs1(text, requested):
        # ASCII is expected only for an ASCII-only string, and only when
        # the request is not exactly UCS1.
        if requested != PyUnicode_FORMAT_UCS1 and text.isascii():
            reported = PyUnicode_FORMAT_ASCII
        else:
            reported = PyUnicode_FORMAT_UCS1
        actual = export(text, requested)
        self.assertEqual(
            actual, (text.encode('latin1'), reported, 1, ucs1_code))

    def expect_ucs2(text, requested):
        actual = export(text, requested)
        self.assertEqual(
            actual,
            (text.encode(utf16), PyUnicode_FORMAT_UCS2, 2, ucs2_code))

    def expect_ucs4(text, requested):
        actual = export(text, requested)
        self.assertEqual(
            actual,
            (text.encode(utf32), PyUnicode_FORMAT_UCS4, 4, ucs4_code))

    def expect_utf8(text):
        actual = export(text, PyUnicode_FORMAT_UTF8)
        self.assertEqual(
            actual, (text.encode('utf8'), PyUnicode_FORMAT_UTF8, 1, 'B'))

    # export to the native format: the expected format is the narrowest
    # one able to hold the text
    for text in ("abc", "latin1:\xe9"):
        expect_ucs1(text, native)
    expect_ucs2('ucs2:\u20ac', native)
    expect_ucs4('ucs4:\U0010ffff', native)

    # export ASCII as UCS1
    expect_ucs1("abc", PyUnicode_FORMAT_UCS1)

    # export ASCII and UCS1 to UCS2
    for text in ("abc", "latin1:\xe9"):
        expect_ucs2(text, PyUnicode_FORMAT_UCS2)

    samples = ("abc", "latin1:\xe9", 'ucs2:\u20ac', 'ucs4:\U0010ffff')

    # always export to UCS4
    for text in samples:
        expect_ucs4(text, PyUnicode_FORMAT_UCS4)

    # always export to UTF8
    for text in samples:
        expect_utf8(text)

    # No supported format or invalid format
    err_msg = "unable to find a matching export format"
    for formats in (0, PyUnicode_FORMAT_INVALID):
        with self.subTest(formats=formats), \
                self.assertRaisesRegex(ValueError, err_msg):
            export('abc', formats)

def test_unicode_import(self):
    # Test PyUnicode_Import() through _testlimitedcapi.unicode_import(),
    # which is called with the raw bytes and a PyUnicode_FORMAT_* value.
    import_ = _testlimitedcapi.unicode_import
    little_endian = (sys.byteorder == 'little')
    utf16 = 'utf-16le' if little_endian else 'utf-16be'
    utf32 = 'utf-32le' if little_endian else 'utf-32be'

    ucs2_text = 'ucs2:\u20ac'
    ucs4_text = 'ucs4:\U0010ffff'
    mixed_text = "abc\xe9\U0010ffff"

    # Valid input: (data, format, expected string)
    valid_cases = (
        (b'abc', PyUnicode_FORMAT_ASCII, "abc"),
        (b'latin1:\xe9', PyUnicode_FORMAT_UCS1, "latin1:\xe9"),
        (ucs2_text.encode(utf16), PyUnicode_FORMAT_UCS2, ucs2_text),
        (ucs4_text.encode(utf32), PyUnicode_FORMAT_UCS4, ucs4_text),
        (mixed_text.encode('utf8'), PyUnicode_FORMAT_UTF8, mixed_text),
    )
    for data, data_format, expected in valid_cases:
        self.assertEqual(import_(data, data_format), expected)

    # Empty string
    every_format = (
        PyUnicode_FORMAT_ASCII,
        PyUnicode_FORMAT_UCS1,
        PyUnicode_FORMAT_UCS2,
        PyUnicode_FORMAT_UCS4,
        PyUnicode_FORMAT_UTF8,
    )
    for native_format in every_format:
        with self.subTest(native_format=native_format):
            self.assertEqual(import_(b'', native_format), '')

    # Invalid format
    with self.assertRaises(ValueError):
        import_(b'', PyUnicode_FORMAT_INVALID)

    # Invalid size: the data is cut so that its length is no longer a
    # multiple of 2 bytes (UCS2) or 4 bytes (UCS4)
    ucs2_data = ucs2_text.encode(utf16)
    ucs4_data = ucs4_text.encode(utf32)
    truncated = [(ucs2_data[:-1], PyUnicode_FORMAT_UCS2)]
    truncated += [(ucs4_data[:-cut], PyUnicode_FORMAT_UCS4)
                  for cut in (1, 2, 3)]
    for data, data_format in truncated:
        with self.assertRaises(ValueError):
            import_(data, data_format)


class PyUnicodeWriterTest(unittest.TestCase):
def create_writer(self, size):
Expand Down Expand Up @@ -1903,6 +2048,38 @@ def test_recover_error(self):

self.assertEqual(writer.finish(), 'Hello World.')


if __name__ == "__main__":
def test_unicode_export_import_roundtrip(self):
    # Exporting a string and importing the exported bytes back, using the
    # format reported by the export, must give the original string.
    export = _testlimitedcapi.unicode_export
    import_ = _testlimitedcapi.unicode_import

    ASCII = PyUnicode_FORMAT_ASCII
    UCS1 = PyUnicode_FORMAT_UCS1
    UCS2 = PyUnicode_FORMAT_UCS2
    UCS4 = PyUnicode_FORMAT_UCS4
    UTF8 = PyUnicode_FORMAT_UTF8
    ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8)

    def roundtrip(text, requested):
        # The export helper returns four items; only the bytes and the
        # reported format are needed to import the string back.
        data, data_format, _item_size, _view_format = export(text, requested)
        self.assertEqual(import_(data, data_format), text)

    # (string, formats in which exporting that string must succeed)
    cases = (
        ('', {ASCII, UCS1, UCS2, UCS4, UTF8}),
        ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}),
        ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
        ('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
        ('ucs4:\U0001f638', {UCS4, UTF8}),
    )
    for text, allowed_formats in cases:
        for fmt in (ASCII, UCS1, UCS2, UCS4, UTF8):
            with self.subTest(string=text, format=fmt):
                if fmt in allowed_formats:
                    roundtrip(text, fmt)
                else:
                    # A format too narrow for the string is rejected.
                    with self.assertRaises(ValueError):
                        export(text, fmt)

        # Requesting every format at once must always succeed.
        roundtrip(text, ALL)


if __name__ == '__main__':
unittest.main()
3 changes: 3 additions & 0 deletions Lib/test/test_stable_abi_ctypes.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, and
:c:func:`PyUnicode_Import` functions to export and import strings. Patch by
Victor Stinner.
16 changes: 16 additions & 0 deletions Misc/stable_abi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2526,3 +2526,19 @@
added = '3.14'
[function.PyLong_AsUInt64]
added = '3.14'
[const.PyUnicode_FORMAT_ASCII]
added = '3.14'
[const.PyUnicode_FORMAT_UCS1]
added = '3.14'
[const.PyUnicode_FORMAT_UCS2]
added = '3.14'
[const.PyUnicode_FORMAT_UCS4]
added = '3.14'
[const.PyUnicode_FORMAT_UTF8]
added = '3.14'
[function.PyUnicode_Export]
added = '3.14'
[function.PyUnicode_GetBufferFormat]
added = '3.14'
[function.PyUnicode_Import]
added = '3.14'
Loading

0 comments on commit f819661

Please sign in to comment.