From 2f8bf7784afaf10deef5406bffebcf513cf95212 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 27 May 2024 16:21:18 +0200 Subject: [PATCH] gh-119609: Add PyUnicode_AsNativeFormat() function Add PyUnicode_AsNativeFormat() and PyUnicode_FromNativeFormat() functions to the C API. --- Doc/c-api/unicode.rst | 47 +++++++++++ Doc/data/stable_abi.dat | 2 + Doc/whatsnew/3.14.rst | 6 ++ Include/unicodeobject.h | 22 +++++ Lib/test/test_capi/test_unicode.py | 81 +++++++++++++++++- Lib/test/test_stable_abi_ctypes.py | 2 + ...-05-27-17-46-17.gh-issue-119609.kPIx6S.rst | 3 + Misc/stable_abi.toml | 4 + Modules/_testlimitedcapi/unicode.c | 31 +++++++ Objects/unicodeobject.c | 83 +++++++++++++++++++ PC/python3dll.c | 2 + 11 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513e..0f3b6c29200f34f 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -341,6 +341,53 @@ APIs: .. versionadded:: 3.3 +.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format) + + Get the contents of a string in its native format. + + * Return the contents, set *\*size* (a number of bytes) and *\*native_format* on success. + * Set an exception and return ``NULL`` on error. + + The contents are valid as long as *unicode* is valid. + + *unicode*, *size* and *native_format* must not be ``NULL``. + + *\*native_format* is set to one of these native formats: + + .. c:namespace:: NULL + + ======================================== ===== ============================ + Constant Identifier Value Description + ======================================== ===== ============================ + .. c:macro:: PyUnicode_NATIVE_ASCII ``1`` ASCII string (``Py_UCS1*``) + .. c:macro:: PyUnicode_NATIVE_UCS1 ``2`` UCS-1 string (``Py_UCS1*``) + .. c:macro:: PyUnicode_NATIVE_UCS2 ``3`` UCS-2 string (``Py_UCS2*``) + .. 
c:macro:: PyUnicode_NATIVE_UCS4 ``4`` UCS-4 string (``Py_UCS4*``) + .. c:macro:: PyUnicode_NATIVE_UTF8 ``5`` UTF-8 string (``char*``) + ======================================== ===== ============================ + + .. impl-detail:: + In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by + :c:func:`PyUnicode_AsNativeFormat`, but it's accepted by + :c:func:`PyUnicode_FromNativeFormat`. + + .. versionadded:: 3.14 + + +.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format) + + Create a string object from a native format string. + + * Return a reference to a new string object on success. + * Set an exception and return ``NULL`` on error. + + *data* must not be ``NULL``. *size* is the length of *data* in bytes; it must be positive or zero, and a multiple of 2 for UCS-2 or of 4 for UCS-4. + + See :c:func:`PyUnicode_AsNativeFormat` for the available native formats. + + .. versionadded:: 3.14 + + .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \ Py_ssize_t size) diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat index 76a035f194d9115..e4aef2ea0385e32 100644 --- a/Doc/data/stable_abi.dat +++ b/Doc/data/stable_abi.dat @@ -734,6 +734,7 @@ function,PyUnicode_AsEncodedString,3.2,, function,PyUnicode_AsEncodedUnicode,3.2,, function,PyUnicode_AsLatin1String,3.2,, function,PyUnicode_AsMBCSString,3.7,on Windows, +function,PyUnicode_AsNativeFormat,3.14,, function,PyUnicode_AsRawUnicodeEscapeString,3.2,, function,PyUnicode_AsUCS4,3.7,, function,PyUnicode_AsUCS4Copy,3.7,, @@ -784,6 +785,7 @@ function,PyUnicode_Format,3.2,, function,PyUnicode_FromEncodedObject,3.2,, function,PyUnicode_FromFormat,3.2,, function,PyUnicode_FromFormatV,3.2,, +function,PyUnicode_FromNativeFormat,3.14,, function,PyUnicode_FromObject,3.2,, function,PyUnicode_FromOrdinal,3.2,, function,PyUnicode_FromString,3.2,, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index bc12d4b3b590ddd..280a4f058a446ad 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -217,6 
+217,12 @@ C API Changes New Features ------------ +* Add :c:func:`PyUnicode_AsNativeFormat` and + :c:func:`PyUnicode_FromNativeFormat` functions to import and export strings + in their native format. + (Contributed by Victor Stinner in :gh:`119609`.) + + Porting to Python 3.14 ---------------------- diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index dee00715b3c51d5..a106b0aaf03ba83 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -248,6 +248,28 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( const char *u /* UTF-8 encoded string */ ); +#define PyUnicode_NATIVE_ASCII 1 +#define PyUnicode_NATIVE_UCS1 2 +#define PyUnicode_NATIVE_UCS2 3 +#define PyUnicode_NATIVE_UCS4 4 +#define PyUnicode_NATIVE_UTF8 5 + +// Get the content of a string in its native format. +// - Return the content, set '*size' (in bytes) and '*native_format' on success. +// - Set an exception and return NULL on error. +PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat( + PyObject *unicode, + Py_ssize_t *size, + int *native_format); + +// Create a string object from a native format string. +// - Return a reference to a new string object on success. +// - Set an exception and return NULL on error. 
+PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat( + const void *data, + Py_ssize_t size, + int native_format); + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7d..dda1dd116f0c048 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -24,6 +24,14 @@ class Str(str): pass +PyUnicode_NATIVE_ASCII = 1 +PyUnicode_NATIVE_UCS1 = 2 +PyUnicode_NATIVE_UCS2 = 3 +PyUnicode_NATIVE_UCS4 = 4 +PyUnicode_NATIVE_UTF8 = 5 +# Invalid native format +PyUnicode_NATIVE_INVALID = 0 + class CAPITest(unittest.TestCase): @support.cpython_only @@ -1675,6 +1683,75 @@ def test_pep393_utf8_caching_bug(self): # Check that the second call returns the same result self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1)) - -if __name__ == "__main__": + def test_unicode_asnativeformat(self): + # Test PyUnicode_AsNativeFormat() + asnativeformat = _testlimitedcapi.unicode_asnativeformat + self.assertEqual(asnativeformat("abc"), + (b'abc', PyUnicode_NATIVE_ASCII)) + self.assertEqual(asnativeformat("latin1:\xe9"), + (b'latin1:\xe9', PyUnicode_NATIVE_UCS1)) + + ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' + self.assertEqual(asnativeformat('ucs2:\u20ac'), + ('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_NATIVE_UCS2)) + + ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' + self.assertEqual(asnativeformat('ucs4:\U0010ffff'), + ('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_NATIVE_UCS4)) + + def test_unicode_fromnativeformat(self): + # Test PyUnicode_FromNativeFormat() + fromnativeformat = _testlimitedcapi.unicode_fromnativeformat + self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII), + "abc") + self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1), + "latin1:\xe9") + + ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be' + 
self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc), + PyUnicode_NATIVE_UCS2), + 'ucs2:\u20ac') + + ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be' + self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc), + PyUnicode_NATIVE_UCS4), + 'ucs4:\U0010ffff') + + text = "abc\xe9\U0010ffff" + self.assertEqual(fromnativeformat(text.encode('utf8'), + PyUnicode_NATIVE_UTF8), + text) + + # Empty string + for native_format in ( + PyUnicode_NATIVE_ASCII, + PyUnicode_NATIVE_UCS1, + PyUnicode_NATIVE_UCS2, + PyUnicode_NATIVE_UCS4, + PyUnicode_NATIVE_UTF8, + ): + with self.subTest(native_format=native_format): + self.assertEqual(fromnativeformat(b'', native_format), + '') + + # Invalid format + with self.assertRaises(ValueError): + fromnativeformat(b'', PyUnicode_NATIVE_INVALID) + + # Invalid size + ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc) + with self.assertRaises(ValueError): + fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2) + ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4) + with self.assertRaises(ValueError): + fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4) + + +if __name__ == '__main__': unittest.main() diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py index c06c285c5013a61..99bc693448f122f 100644 --- a/Lib/test/test_stable_abi_ctypes.py +++ b/Lib/test/test_stable_abi_ctypes.py @@ -760,6 +760,7 @@ def test_windows_feature_macros(self): "PyUnicode_AsEncodedString", "PyUnicode_AsEncodedUnicode", "PyUnicode_AsLatin1String", + "PyUnicode_AsNativeFormat", "PyUnicode_AsRawUnicodeEscapeString", "PyUnicode_AsUCS4", "PyUnicode_AsUCS4Copy", @@ -806,6 +807,7 @@ def test_windows_feature_macros(self): "PyUnicode_FromEncodedObject", "PyUnicode_FromFormat", "PyUnicode_FromFormatV", + "PyUnicode_FromNativeFormat", "PyUnicode_FromObject", 
"PyUnicode_FromOrdinal", "PyUnicode_FromString", diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst new file mode 100644 index 000000000000000..06f9a061ec8ac09 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst @@ -0,0 +1,3 @@ +Add :c:func:`PyUnicode_AsNativeFormat` and +:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings +in their native format. Patch by Victor Stinner. diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml index 77473662aaa76ca..5fe199be27f79de 100644 --- a/Misc/stable_abi.toml +++ b/Misc/stable_abi.toml @@ -2507,3 +2507,7 @@ added = '3.13' [function.PyEval_GetFrameLocals] added = '3.13' +[function.PyUnicode_AsNativeFormat] + added = '3.14' +[function.PyUnicode_FromNativeFormat] + added = '3.14' diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c index 2b70d09108a3335..66da5b1d1846b40 100644 --- a/Modules/_testlimitedcapi/unicode.c +++ b/Modules/_testlimitedcapi/unicode.c @@ -1837,6 +1837,35 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored)) #undef CHECK_FORMAT_0 } + +// Test PyUnicode_AsNativeFormat() +static PyObject* +unicode_asnativeformat(PyObject *self, PyObject *obj) +{ + Py_ssize_t size; + int native_format; + const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format); + if (data == NULL) { + return NULL; + } + return Py_BuildValue("y#i", data, size, native_format); +} + + +// Test PyUnicode_FromNativeFormat() +static PyObject* +unicode_fromnativeformat(PyObject *self, PyObject *args) +{ + const void *data; + Py_ssize_t size; + int native_format; + if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) { + return NULL; + } + return PyUnicode_FromNativeFormat(data, size, native_format); +} + + static PyMethodDef TestMethods[] = { {"codec_incrementalencoder", codec_incrementalencoder, 
METH_VARARGS}, {"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS}, @@ -1924,6 +1953,8 @@ static PyMethodDef TestMethods[] = { {"unicode_format", unicode_format, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_isidentifier", unicode_isidentifier, METH_O}, + {"unicode_asnativeformat", unicode_asnativeformat, METH_O}, + {"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS}, {NULL}, }; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 048f9a814c30af1..74dd1996242e3d6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2094,6 +2094,89 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return res; } +const void* +PyUnicode_AsNativeFormat(PyObject *unicode, + Py_ssize_t *size, int *native_format) +{ + if (!PyUnicode_Check(unicode)) { + *size = 0; + *native_format = 0; + PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode); + return NULL; + } + + Py_ssize_t len = PyUnicode_GET_LENGTH(unicode); + + if (PyUnicode_IS_ASCII(unicode)) { + *native_format = PyUnicode_NATIVE_ASCII; + *size = len; + return PyUnicode_1BYTE_DATA(unicode); + } + int kind = PyUnicode_KIND(unicode); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: + *native_format = PyUnicode_NATIVE_UCS1; + *size = len; + return PyUnicode_1BYTE_DATA(unicode); + + case PyUnicode_2BYTE_KIND: + *native_format = PyUnicode_NATIVE_UCS2; + *size = len * 2; + return PyUnicode_2BYTE_DATA(unicode); + + default: + assert(kind == PyUnicode_4BYTE_KIND); + *native_format = PyUnicode_NATIVE_UCS4; + *size = len * 4; + return PyUnicode_4BYTE_DATA(unicode); + } +} + +PyObject* +PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, + int native_format) +{ + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "Negative size"); + return NULL; + } + + switch (native_format) + { + case PyUnicode_NATIVE_ASCII: + return PyUnicode_DecodeASCII((const char*)data, size, NULL); + + case PyUnicode_NATIVE_UCS1: + return 
_PyUnicode_FromUCS1(data, size); + + case PyUnicode_NATIVE_UCS2: + if (size % 2) { + PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd", + size); + return NULL; + } + return _PyUnicode_FromUCS2(data, size / 2); + + case PyUnicode_NATIVE_UCS4: + if (size % 4) { + PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd", + size); + return NULL; + } + return _PyUnicode_FromUCS4(data, size / 4); + + case PyUnicode_NATIVE_UTF8: + return PyUnicode_DecodeUTF8((const char*)data, size, NULL); + + default: + PyErr_Format(PyExc_ValueError, "unknown native format %i", + native_format); + return NULL; + } +} + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { diff --git a/PC/python3dll.c b/PC/python3dll.c index 86c888430891c9a..ca558c6fcf56fe9 100755 --- a/PC/python3dll.c +++ b/PC/python3dll.c @@ -665,6 +665,7 @@ EXPORT_FUNC(PyUnicode_AsEncodedString) EXPORT_FUNC(PyUnicode_AsEncodedUnicode) EXPORT_FUNC(PyUnicode_AsLatin1String) EXPORT_FUNC(PyUnicode_AsMBCSString) +EXPORT_FUNC(PyUnicode_AsNativeFormat) EXPORT_FUNC(PyUnicode_AsRawUnicodeEscapeString) EXPORT_FUNC(PyUnicode_AsUCS4) EXPORT_FUNC(PyUnicode_AsUCS4Copy) @@ -713,6 +714,7 @@ EXPORT_FUNC(PyUnicode_Format) EXPORT_FUNC(PyUnicode_FromEncodedObject) EXPORT_FUNC(PyUnicode_FromFormat) EXPORT_FUNC(PyUnicode_FromFormatV) +EXPORT_FUNC(PyUnicode_FromNativeFormat) EXPORT_FUNC(PyUnicode_FromObject) EXPORT_FUNC(PyUnicode_FromOrdinal) EXPORT_FUNC(PyUnicode_FromString)