From f8196614db84a4ae577161f8aff8157c84fef777 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 27 May 2024 16:21:18 +0200 Subject: [PATCH] gh-119609: Add PyUnicode_Export() function Add PyUnicode_Export(), PyUnicode_GetBufferFormat() and PyUnicode_Import() functions to the limited C API. --- Doc/c-api/unicode.rst | 65 +++++ Doc/data/stable_abi.dat | 3 + Doc/whatsnew/3.14.rst | 4 + Include/unicodeobject.h | 18 ++ Lib/test/test_capi/test_unicode.py | 183 ++++++++++++++- Lib/test/test_stable_abi_ctypes.py | 3 + ...-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst | 3 + Misc/stable_abi.toml | 16 ++ Modules/_testlimitedcapi/unicode.c | 70 ++++++ Objects/unicodeobject.c | 222 +++++++++++++++++- PC/python3dll.c | 3 + 11 files changed, 586 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 958fafd47ac81b5..603905d21555e5b 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,6 +341,71 @@ APIs: .. versionadded:: 3.3 +.. c:function:: int PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) + + Export the contents of the *unicode* string in one of the requested format + *requested_formats*. + + * On success, fill *view*, and return ``0``. + * On error, set an exception and return ``-1``. + + The export must be released by :c:func:`PyBuffer_Release`. + The contents of the buffer are valid until they are released. + + The buffer is read-only and must not be modified. + + *unicode* and *view* must not be NULL. + + Available formats: + + .. c:namespace:: NULL + + =================================== ======== =========================== + Constant Identifier Value Description + =================================== ======== =========================== + .. c:macro:: PyUnicode_FORMAT_ASCII ``0x01`` ASCII string (``Py_UCS1*``) + .. c:macro:: PyUnicode_FORMAT_UCS1 ``0x02`` UCS-1 string (``Py_UCS1*``) + .. 
c:macro:: PyUnicode_FORMAT_UCS2 ``0x04`` UCS-2 string (``Py_UCS2*``) + .. c:macro:: PyUnicode_FORMAT_UCS4 ``0x08`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_FORMAT_UTF8 ``0x10`` UTF-8 string (``char*``) + =================================== ======== =========================== + + *requested_formats* can be a single format or a bitwise combination of the + formats in the table above. On success, exactly one of them is used; call + :c:func:`PyUnicode_GetBufferFormat` on *view* to find out which one. + + Note that future versions of Python may introduce additional formats. + + .. versionadded:: 3.14 + + +.. c:function:: int PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) + + Get the format of the buffer *view*. + + * On success, set *\*format* to the corresponding ``PyUnicode_FORMAT_*`` + value and return ``0``. + * On error, set an exception and return ``-1``. + + *view* must be a buffer filled by :c:func:`PyUnicode_Export`. + + .. versionadded:: 3.14 + + +.. c:function:: PyObject* PyUnicode_Import(const void *data, Py_ssize_t nbytes, uint32_t format) + + Create a string object from a buffer in an “export format”. + + * Return a reference to a new string object on success. + * Set an exception and return ``NULL`` on error. + + *data* must not be NULL. *nbytes* must be positive or zero. + + See :c:func:`PyUnicode_Export` for the available formats. + + .. versionadded:: 3.14 + + +.. 
c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \ Py_ssize_t size) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 7eeee270bb7f322..a6745986c2025e6 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -784,6 +784,7 @@ func,PyUnicode_EncodeFSDefault,3.2,, func,PyUnicode_EncodeLocale,3.7,, func,PyUnicode_EqualToUTF8,3.13,, func,PyUnicode_EqualToUTF8AndSize,3.13,, +func,PyUnicode_Export,3.14,, func,PyUnicode_FSConverter,3.2,, func,PyUnicode_FSDecoder,3.2,, func,PyUnicode_Find,3.2,, @@ -797,8 +798,10 @@ func,PyUnicode_FromOrdinal,3.2,, func,PyUnicode_FromString,3.2,, func,PyUnicode_FromStringAndSize,3.2,, func,PyUnicode_FromWideChar,3.2,, +func,PyUnicode_GetBufferFormat,3.14,, func,PyUnicode_GetDefaultEncoding,3.2,, func,PyUnicode_GetLength,3.7,, +func,PyUnicode_Import,3.14,, func,PyUnicode_InternFromString,3.2,, func,PyUnicode_InternInPlace,3.2,, func,PyUnicode_IsIdentifier,3.2,, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index e1bd52370d776c4..1d5e2a10b1b6dca 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -529,6 +529,10 @@ New Features (Contributed by Victor Stinner in :gh:`107954`.) +* Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, + and :c:func:`PyUnicode_Import` functions to export and import strings. + (Contributed by Victor Stinner in :gh:`119609`.) 
+ Porting to Python 3.14 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee00715b3c51d5..75d41a90ae65d7d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,24 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#define PyUnicode_FORMAT_ASCII 0x01 // Py_UCS1* (ASCII string) +#define PyUnicode_FORMAT_UCS1 0x02 // Py_UCS1* +#define PyUnicode_FORMAT_UCS2 0x04 // Py_UCS2* +#define PyUnicode_FORMAT_UCS4 0x08 // Py_UCS4* +#define PyUnicode_FORMAT_UTF8 0x10 // char* + +PyAPI_FUNC(int) PyUnicode_Export( + PyObject *unicode, + uint32_t requested_formats, + Py_buffer *view); +PyAPI_FUNC(int) PyUnicode_GetBufferFormat( + const Py_buffer *view, + uint32_t *format); +PyAPI_FUNC(PyObject*) PyUnicode_Import( + const void *data, + Py_ssize_t nbytes, + uint32_t format); + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index e6f854272149587..19397cb30715ddb 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,5 +1,6 @@ -import unittest +import struct import sys +import unittest from test import support from test.support import import_helper @@ -28,6 +29,14 @@ class Str(str): pass +PyUnicode_FORMAT_ASCII = 0x01 +PyUnicode_FORMAT_UCS1 = 0x02 +PyUnicode_FORMAT_UCS2 = 0x04 +PyUnicode_FORMAT_UCS4 = 0x08 +PyUnicode_FORMAT_UTF8 = 0x10 +# Invalid native format +PyUnicode_FORMAT_INVALID = 0x20 + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1721,6 +1730,142 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) + def test_unicode_export(self): + # Test PyUnicode_Export() and PyUnicode_GetBufferFormat() + unicode_export = _testlimitedcapi.unicode_export + if sys.byteorder == 'little': 
+ ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + # export to the native format + formats = (PyUnicode_FORMAT_ASCII + | PyUnicode_FORMAT_UCS1 + | PyUnicode_FORMAT_UCS2 + | PyUnicode_FORMAT_UCS4) + BUFFER_UCS1 = 'B' + BUFFER_UCS2 = 'H' + if struct.calcsize('I') == 4: + BUFFER_UCS4 = 'I' + elif struct.calcsize('L') == 4: + BUFFER_UCS4 = 'L' + else: + self.fail("unable to get BUFFER_UCS4 ") + + def check_ucs1(text, formats): + if formats == PyUnicode_FORMAT_UCS1: + export_format = PyUnicode_FORMAT_UCS1 + elif text.isascii(): + export_format = PyUnicode_FORMAT_ASCII + else: + export_format = PyUnicode_FORMAT_UCS1 + self.assertEqual(unicode_export(text, formats), + (text.encode('latin1'), export_format, 1, BUFFER_UCS1)) + + def check_ucs2(text, formats): + self.assertEqual(unicode_export(text, formats), + (text.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2, 2, BUFFER_UCS2)) + + def check_ucs4(text, formats): + self.assertEqual(unicode_export(text, formats), + (text.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4, 4, BUFFER_UCS4)) + + def check_utf8(text): + self.assertEqual(unicode_export(text, PyUnicode_FORMAT_UTF8), + (text.encode('utf8'), + PyUnicode_FORMAT_UTF8, 1, 'B')) + + check_ucs1("abc", formats) + check_ucs1("latin1:\xe9", formats) + check_ucs2('ucs2:\u20ac', formats) + check_ucs4('ucs4:\U0010ffff', formats) + + # export ASCII as UCS1 + check_ucs1("abc", PyUnicode_FORMAT_UCS1) + + # export ASCII and UCS1 to UCS2 + check_ucs2("abc", PyUnicode_FORMAT_UCS2) + check_ucs2("latin1:\xe9", PyUnicode_FORMAT_UCS2) + + # always export to UCS4 + check_ucs4("abc", PyUnicode_FORMAT_UCS4) + check_ucs4("latin1:\xe9", PyUnicode_FORMAT_UCS4) + check_ucs4('ucs2:\u20ac', PyUnicode_FORMAT_UCS4) + check_ucs4('ucs4:\U0010ffff', PyUnicode_FORMAT_UCS4) + + # always export to UTF8 + check_utf8("abc") + check_utf8("latin1:\xe9") + check_utf8('ucs2:\u20ac') + check_utf8('ucs4:\U0010ffff') + + # No supported format or invalid format + for 
formats in (0, PyUnicode_FORMAT_INVALID): + err_msg = "unable to find a matching export format" + with self.subTest(formats=formats): + with self.assertRaisesRegex(ValueError, err_msg): + unicode_export('abc', formats) + + def test_unicode_import(self): + # Test PyUnicode_Import() + unicode_import = _testlimitedcapi.unicode_import + if sys.byteorder == 'little': + ucs2_enc = 'utf-16le' + ucs4_enc = 'utf-32le' + else: + ucs2_enc = 'utf-16be' + ucs4_enc = 'utf-32be' + + self.assertEqual(unicode_import(b'abc', PyUnicode_FORMAT_ASCII), + "abc") + self.assertEqual(unicode_import(b'latin1:\xe9', PyUnicode_FORMAT_UCS1), + "latin1:\xe9") + + self.assertEqual(unicode_import('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_FORMAT_UCS2), + 'ucs2:\u20ac') + + self.assertEqual(unicode_import('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_FORMAT_UCS4), + 'ucs4:\U0010ffff') + + text = "abc\xe9\U0010ffff" + self.assertEqual(unicode_import(text.encode('utf8'), + PyUnicode_FORMAT_UTF8), + text) + + # Empty string + for native_format in ( + PyUnicode_FORMAT_ASCII, + PyUnicode_FORMAT_UCS1, + PyUnicode_FORMAT_UCS2, + PyUnicode_FORMAT_UCS4, + PyUnicode_FORMAT_UTF8, + ): + with self.subTest(native_format=native_format): + self.assertEqual(unicode_import(b'', native_format), + '') + + # Invalid format + with self.assertRaises(ValueError): + unicode_import(b'', PyUnicode_FORMAT_INVALID) + + # Invalid size + ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) + with self.assertRaises(ValueError): + unicode_import(ucs2[:-1], PyUnicode_FORMAT_UCS2) + ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-1], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-2], PyUnicode_FORMAT_UCS4) + with self.assertRaises(ValueError): + unicode_import(ucs4[:-3], PyUnicode_FORMAT_UCS4) + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): @@ -1903,6 +2048,38 @@ def test_recover_error(self): 
self.assertEqual(writer.finish(), 'Hello World.') - -if __name__ == "__main__": + def test_unicode_export_import_roundtrip(self): + unicode_export = _testlimitedcapi.unicode_export + unicode_import = _testlimitedcapi.unicode_import + + ASCII = PyUnicode_FORMAT_ASCII + UCS1 = PyUnicode_FORMAT_UCS1 + UCS2 = PyUnicode_FORMAT_UCS2 + UCS4 = PyUnicode_FORMAT_UCS4 + UTF8 = PyUnicode_FORMAT_UTF8 + ALL = (ASCII | UCS1 | UCS2 | UCS4 | UTF8) + + for string, allowed_formats in ( + ('', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('ascii', {ASCII, UCS1, UCS2, UCS4, UTF8}), + ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}), + ('ucs2:\u20ac', {UCS2, UCS4, UTF8}), + ('ucs4:\U0001f638', {UCS4, UTF8}), + ): + for format in ASCII, UCS1, UCS2, UCS4, UTF8: + with self.subTest(string=string, format=format): + if format not in allowed_formats: + with self.assertRaises(ValueError): + unicode_export(string, format) + else: + buf, buf_fmt, item_size, view_fmt = unicode_export(string, format) + restored = unicode_import(buf, buf_fmt) + self.assertEqual(restored, string) + + buf, buf_fmt, item_size, view_fmt = unicode_export(string, ALL) + restored = unicode_import(buf, buf_fmt) + self.assertEqual(restored, string) + + +if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index 4bca33b7451f80b..b496b43d4ef6cdb 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -806,6 +806,7 @@ def test_windows_feature_macros(self): "PyUnicode_EncodeLocale", "PyUnicode_EqualToUTF8", "PyUnicode_EqualToUTF8AndSize", + "PyUnicode_Export", "PyUnicode_FSConverter", "PyUnicode_FSDecoder", "PyUnicode_Find", @@ -819,9 +820,11 @@ def test_windows_feature_macros(self): "PyUnicode_FromString", "PyUnicode_FromStringAndSize", "PyUnicode_FromWideChar", + "PyUnicode_GetBufferFormat", "PyUnicode_GetDefaultEncoding", "PyUnicode_GetLength", "PyUnicode_GetSize", + "PyUnicode_Import", "PyUnicode_InternFromString", 
"PyUnicode_InternImmortal", "PyUnicode_InternInPlace", diff --git a/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst new file mode 100644 index 000000000000000..6d75f0c192bc858 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2024-09-05-16-57-51.gh-issue-119609.5EZ-kg.rst @@ -0,0 +1,3 @@ +Add :c:func:`PyUnicode_Export`, :c:func:`PyUnicode_GetBufferFormat`, and +:c:func:`PyUnicode_Import` functions to export and import strings. Patch by +Victor Stinner. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 8bf638c473c712f..7fb8971326a0649 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2526,3 +2526,19 @@ added = '3.14' [function.PyLong_AsUInt64] added = '3.14' +[const.PyUnicode_FORMAT_ASCII] + added = '3.14' +[const.PyUnicode_FORMAT_UCS1] + added = '3.14' +[const.PyUnicode_FORMAT_UCS2] + added = '3.14' +[const.PyUnicode_FORMAT_UCS4] + added = '3.14' +[const.PyUnicode_FORMAT_UTF8] + added = '3.14' +[function.PyUnicode_Export] + added = '3.14' +[function.PyUnicode_GetBufferFormat] + added = '3.14' +[function.PyUnicode_Import] + added = '3.14' diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 2b70d09108a3335..2f21d0a338fa938 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1837,6 +1837,74 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) #undef CHECK_FORMAT_0 } + +// Test PyUnicode_Export() +static PyObject* +unicode_export(PyObject *self, PyObject *args) +{ + PyObject *obj; + unsigned int requested_formats; + if (!PyArg_ParseTuple(args, "OI", &obj, &requested_formats)) { + return NULL; + } + + Py_buffer view; + if (PyUnicode_Export(obj, requested_formats, &view) < 0) { + return NULL; + } + uint32_t format; + if (PyUnicode_GetBufferFormat(&view, &format) < 0) { + return NULL; + } + + // Make sure that the exported string ends with a NUL character + 
char *data = view.buf; + Py_ssize_t nbytes = view.len * view.itemsize; + switch (format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + assert(data[nbytes] == 0); + break; + case PyUnicode_FORMAT_UCS2: + assert(data[nbytes] == 0); + assert(data[nbytes+1] == 0); + break; + case PyUnicode_FORMAT_UCS4: + assert(data[nbytes] == 0); + assert(data[nbytes+1] == 0); + assert(data[nbytes+2] == 0); + assert(data[nbytes+3] == 0); + break; + case PyUnicode_FORMAT_UTF8: + assert(data[nbytes] == 0); + break; + } + + assert(view.format != NULL); + PyObject *res = Py_BuildValue("y#Iis", + view.buf, view.len * view.itemsize, + (unsigned int)format, + (int)view.itemsize, view.format); + PyBuffer_Release(&view); + return res; +} + + +// Test PyUnicode_Import() +static PyObject* +unicode_import(PyObject *self, PyObject *args) +{ + const void *data; + Py_ssize_t nbytes; + unsigned int format; + if (!PyArg_ParseTuple(args, "y#I", &data, &nbytes, &format)) { + return NULL; + } + return PyUnicode_Import(data, nbytes, format); +} + + static PyMethodDef TestMethods[] = { {"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS}, {"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS}, @@ -1924,6 +1992,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, + {"unicode_export", unicode_export, METH_VARARGS}, + {"unicode_import", unicode_import, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2494c989544ca01..fe34536ff86c797 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2332,6 +2332,224 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, } +// Fill *view* with a read-only export of *unicode* and remember the +// PyUnicode_FORMAT_* flag in view->internal. +static int +unicode_export(PyObject *unicode, Py_buffer *view, + Py_ssize_t len, const void *buf, + int itemsize, const char *format, uint32_t internal_format) +{ + if (PyBuffer_FillInfo(view, 
unicode, (void*)buf, len, + 1, PyBUF_SIMPLE) < 0) { + return -1; + } + view->itemsize = itemsize; + view->format = (char*)format; + view->internal = (void*)(uintptr_t)internal_format; + return 0; +} + + +int +PyUnicode_Export(PyObject *unicode, uint32_t requested_formats, Py_buffer *view) +{ +#if SIZEOF_INT == 4 +# define BUFFER_UCS4 "I" +#elif SIZEOF_LONG == 4 +# define BUFFER_UCS4 "L" +#else +# error "unable to find BUFFER_UCS4" +#endif + + if (!PyUnicode_Check(unicode)) { + PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); + return -1; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + + // Native ASCII + if (PyUnicode_IS_ASCII(unicode) + && (requested_formats & PyUnicode_FORMAT_ASCII)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_ASCII); + } + + // Native UCS1 + int kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS1)) + { + return unicode_export(unicode, view, + len, PyUnicode_1BYTE_DATA(unicode), + 1, "B", PyUnicode_FORMAT_UCS1); + } + + // Native UCS2 + if (kind == PyUnicode_2BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS2)) + { + return unicode_export(unicode, view, + len, PyUnicode_2BYTE_DATA(unicode), + 2, "H", PyUnicode_FORMAT_UCS2); + } + + // Convert ASCII or UCS1 to UCS2 + if (kind == PyUnicode_1BYTE_KIND + && requested_formats & PyUnicode_FORMAT_UCS2) + { + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + Py_UCS2 *ucs2 = PyMem_Malloc((len + 1) * sizeof(Py_UCS2)); + if (!ucs2) { + PyErr_NoMemory(); + return -1; + } + + _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS2, + PyUnicode_1BYTE_DATA(unicode), + PyUnicode_1BYTE_DATA(unicode) + len, + ucs2); + ucs2[len] = 0; + + return unicode_export(unicode, view, + len, ucs2, + 2, "H", PyUnicode_FORMAT_UCS2); + } + + // Native UCS4 + if (kind == PyUnicode_4BYTE_KIND + && (requested_formats & PyUnicode_FORMAT_UCS4)) + { + return unicode_export(unicode, view, + len, 
PyUnicode_4BYTE_DATA(unicode), + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + } + + // Convert ASCII, UCS1 or UCS2 to UCS4 + if (requested_formats & PyUnicode_FORMAT_UCS4) { + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(unicode); + if (ucs4 == NULL) { + return -1; + } + return unicode_export(unicode, view, + len, ucs4, + 4, BUFFER_UCS4, PyUnicode_FORMAT_UCS4); + } + + // Encode UCS1, UCS2 or UCS4 to UTF-8 + if (requested_formats & PyUnicode_FORMAT_UTF8) { + Py_ssize_t nbytes; + const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &nbytes); + if (utf8 == NULL) { + return -1; + } + return unicode_export(unicode, view, + nbytes, utf8, + 1, "B", PyUnicode_FORMAT_UTF8); + } + + PyErr_Format(PyExc_ValueError, "unable to find a matching export format"); + return -1; + +#undef BUFFER_UCS4 +} + + +int +PyUnicode_GetBufferFormat(const Py_buffer *view, uint32_t *format) +{ + if (view->obj == NULL || !PyUnicode_Check(view->obj)) { + PyErr_SetString(PyExc_ValueError, "not a str export"); + return -1; + } + uintptr_t internal_format = (uintptr_t)view->internal; + + switch (internal_format) + { + case PyUnicode_FORMAT_ASCII: + case PyUnicode_FORMAT_UCS1: + case PyUnicode_FORMAT_UCS2: + case PyUnicode_FORMAT_UCS4: + case PyUnicode_FORMAT_UTF8: + break; + default: + PyErr_SetString(PyExc_ValueError, "invalid format"); + return -1; + } + *format = (uint32_t)internal_format; + return 0; +} + + +// Release hook: free the buffer only if PyUnicode_Export() allocated a copy. +static void +unicode_releasebuffer(PyObject *unicode, Py_buffer *view) +{ + switch ((uintptr_t)view->internal) + { + case PyUnicode_FORMAT_UCS2: + // A 1-byte string widened to UCS2 is a PyMem_Malloc() copy. + if (PyUnicode_KIND(unicode) != PyUnicode_2BYTE_KIND) { + PyMem_Free(view->buf); + } + break; + case PyUnicode_FORMAT_UCS4: + // Non-native UCS4 comes from PyUnicode_AsUCS4Copy(). + if (PyUnicode_KIND(unicode) != PyUnicode_4BYTE_KIND) { + PyMem_Free(view->buf); + } + break; + default: + // ASCII, UCS1 and UTF-8 exports borrow memory owned by the string; + // an unknown format is silently ignored. + break; + } +} + +PyObject* +PyUnicode_Import(const void *data, Py_ssize_t nbytes, + uint32_t format) +{ + if (nbytes < 0) { 
PyErr_SetString(PyExc_ValueError, "Negative nbytes"); + return NULL; + } + + switch (format) + { + case PyUnicode_FORMAT_ASCII: + return PyUnicode_DecodeASCII((const char*)data, nbytes, NULL); + + case PyUnicode_FORMAT_UCS1: + return _PyUnicode_FromUCS1(data, nbytes); + + case PyUnicode_FORMAT_UCS2: + if (nbytes % 2) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 2: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS2(data, nbytes / 2); + + case PyUnicode_FORMAT_UCS4: + if (nbytes % 4) { + PyErr_Format(PyExc_ValueError, "nbytes must be a multiple of 4: %zd", + nbytes); + return NULL; + } + return _PyUnicode_FromUCS4(data, nbytes / 4); + + case PyUnicode_FORMAT_UTF8: + return PyUnicode_DecodeUTF8((const char*)data, nbytes, NULL); + + default: + PyErr_Format(PyExc_ValueError, "unknown format: %i", format); + return NULL; + } +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { @@ -15248,6 +15464,10 @@ errors defaults to 'strict'."); static PyObject *unicode_iter(PyObject *seq); +static PyBufferProcs unicode_as_buffer = { + .bf_releasebuffer = unicode_releasebuffer, +}; + PyTypeObject PyUnicode_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "str", /* tp_name */ @@ -15268,7 +15488,7 @@ PyTypeObject PyUnicode_Type = { (reprfunc) unicode_str, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + &unicode_as_buffer, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS | _Py_TPFLAGS_MATCH_SELF, /* tp_flags */ diff --git a/PC/python3dll.c b/PC/python3dll.c index 1845334b244d8c9..1bfa238eb7054d3 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -717,6 +717,7 @@ EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EqualToUTF8) EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize) +EXPORT_FUNC(PyUnicode_Export) EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_FindChar) 
EXPORT_FUNC(PyUnicode_Format) @@ -730,9 +731,11 @@ EXPORT_FUNC(PyUnicode_FromStringAndSize) EXPORT_FUNC(PyUnicode_FromWideChar) EXPORT_FUNC(PyUnicode_FSConverter) EXPORT_FUNC(PyUnicode_FSDecoder) +EXPORT_FUNC(PyUnicode_GetBufferFormat) EXPORT_FUNC(PyUnicode_GetDefaultEncoding) EXPORT_FUNC(PyUnicode_GetLength) EXPORT_FUNC(PyUnicode_GetSize) +EXPORT_FUNC(PyUnicode_Import) EXPORT_FUNC(PyUnicode_InternFromString) EXPORT_FUNC(PyUnicode_InternImmortal) EXPORT_FUNC(PyUnicode_InternInPlace)