diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 75a74ffa2f9dff5..7b5a7fdb9c371c9 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -97,46 +97,41 @@ typedef struct {
     PyObject_HEAD
     Py_ssize_t length;          /* Number of code points in the string */
     Py_hash_t hash;             /* Hash value; -1 if not set */
-    struct {
-        /* If interned is set, the two references from the
-           dictionary to this object are *not* counted in ob_refcnt. */
-        unsigned int interned:1;
-        /* Character size:
-
-           - PyUnicode_1BYTE_KIND (1):
-
-             * character type = Py_UCS1 (8 bits, unsigned)
-             * all characters are in the range U+0000-U+00FF (latin1)
-             * if ascii is set, all characters are in the range U+0000-U+007F
-               (ASCII), otherwise at least one character is in the range
-               U+0080-U+00FF
-
-           - PyUnicode_2BYTE_KIND (2):
-
-             * character type = Py_UCS2 (16 bits, unsigned)
-             * all characters are in the range U+0000-U+FFFF (BMP)
-             * at least one character is in the range U+0100-U+FFFF
-
-           - PyUnicode_4BYTE_KIND (4):
-
-             * character type = Py_UCS4 (32 bits, unsigned)
-             * all characters are in the range U+0000-U+10FFFF
-             * at least one character is in the range U+10000-U+10FFFF
-         */
-        unsigned int kind:3;
-        /* Compact is with respect to the allocation scheme. Compact unicode
-           objects only require one memory block while non-compact objects use
-           one block for the PyUnicodeObject struct and another for its data
-           buffer. */
-        unsigned int compact:1;
-        /* The string only contains characters in the range U+0000-U+007F (ASCII)
-           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
-           set, use the PyASCIIObject structure. */
-        unsigned int ascii:1;
-        /* Padding to ensure that PyUnicode_DATA() is always aligned to
-           4 bytes (see issue #19537 on m68k). */
-        unsigned int :26;
-    } state;
+    /* If interned is set, the two references from the
+       dictionary to this object are *not* counted in ob_refcnt. */
+    uint8_t interned;
+    /* Character size:
+
+       - PyUnicode_1BYTE_KIND (1):
+
+         * character type = Py_UCS1 (8 bits, unsigned)
+         * all characters are in the range U+0000-U+00FF (latin1)
+         * if ascii is set, all characters are in the range U+0000-U+007F
+           (ASCII), otherwise at least one character is in the range
+           U+0080-U+00FF
+
+       - PyUnicode_2BYTE_KIND (2):
+
+         * character type = Py_UCS2 (16 bits, unsigned)
+         * all characters are in the range U+0000-U+FFFF (BMP)
+         * at least one character is in the range U+0100-U+FFFF
+
+       - PyUnicode_4BYTE_KIND (4):
+
+         * character type = Py_UCS4 (32 bits, unsigned)
+         * all characters are in the range U+0000-U+10FFFF
+         * at least one character is in the range U+10000-U+10FFFF
+     */
+    uint8_t kind;
+    /* Compact is with respect to the allocation scheme. Compact unicode
+       objects only require one memory block while non-compact objects use
+       one block for the PyUnicodeObject struct and another for its data
+       buffer. */
+    uint8_t compact;
+    /* The string only contains characters in the range U+0000-U+007F (ASCII)
+       and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
+       set, use the PyASCIIObject structure. */
+    uint8_t ascii;
 } PyASCIIObject;
 
 /* Non-ASCII strings allocated through PyUnicode_New use the
@@ -178,15 +173,9 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
 
 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
 
-/* Values for PyASCIIObject.state: */
-
-/* Interning state. */
-#define SSTATE_NOT_INTERNED 0
-#define SSTATE_INTERNED_MORTAL 1
-
 /* Use only if you know it's a string */
 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.interned;
+    return _PyASCIIObject_CAST(op)->interned;
 }
 #define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
 
@@ -200,21 +189,21 @@ static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
    ready. */
 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.ascii;
+    return _PyASCIIObject_CAST(op)->ascii;
 }
 #define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
 
 /* Return true if the string is compact or 0 if not.
    No type checks or Ready calls are performed. */
 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
-    return _PyASCIIObject_CAST(op)->state.compact;
+    return _PyASCIIObject_CAST(op)->compact;
 }
 #define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
 
 /* Return true if the string is a compact ASCII string (use PyASCIIObject
    structure), or 0 if not.  No type checks or Ready calls are performed. */
 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
-    return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
+    return (_PyASCIIObject_CAST(op)->ascii && PyUnicode_IS_COMPACT(op));
 }
 #define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
 
@@ -231,7 +220,7 @@ enum PyUnicode_Kind {
 // new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
 // unsigned numbers) where kind type is an int or on
 // "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
-#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
+#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->kind)
 
 /* Return a void pointer to the raw unicode buffer. */
 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h
index bdecac944dfd3a2..6f99c3baf347447 100644
--- a/Include/internal/pycore_runtime_init.h
+++ b/Include/internal/pycore_runtime_init.h
@@ -155,11 +155,9 @@ extern PyTypeObject _PyExc_MemoryError;
         .ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \
         .length = sizeof(LITERAL) - 1, \
         .hash = -1, \
-        .state = { \
-            .kind = 1, \
-            .compact = 1, \
-            .ascii = (ASCII), \
-        }, \
+        .kind = 1, \
+        .compact = 1, \
+        .ascii = (ASCII), \
     }
 #define _PyASCIIObject_INIT(LITERAL) \
     { \
diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py
index c34ee578b5c83f1..83cf667d4c9ffe6 100644
--- a/Lib/test/test_capi/test_misc.py
+++ b/Lib/test/test_capi/test_misc.py
@@ -1555,5 +1555,11 @@ def func2(x=None):
         self.do_test(func2)
 
 
+class Test_UnicodeObjectAlignment(unittest.TestCase):
+
+    def test_unicodeobject_data_alignment(self):
+        _testinternalcapi.check_compactunicodeobject_data_alignment()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 632fac2de0c419d..e788148f4b97ddd 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -684,6 +684,21 @@ clear_extension(PyObject *self, PyObject *args)
     Py_RETURN_NONE;
 }
 
+static PyObject *
+check_compactunicodeobject_data_alignment(PyObject *self, PyObject *Py_UNUSED(args))
+{
+    size_t data_offset = sizeof(PyCompactUnicodeObject);
+    if (data_offset % 4 != 0) {
+        // The character data immediately follows the PyCompactUnicodeObject
+        // header, so its size must keep the widest kind (Py_UCS4) 4-byte aligned.
+        PyErr_Format(PyExc_AssertionError,
+                     "PyCompactUnicodeObject size offset is %zu, needs to be multiple of 4 bytes",
+                     data_offset);
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
 
 static PyMethodDef module_functions[] = {
     {"get_configs", get_configs, METH_NOARGS},
@@ -707,6 +722,7 @@ static PyMethodDef module_functions[] = {
     _TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
     {"get_interp_settings", get_interp_settings, METH_VARARGS, NULL},
     {"clear_extension", clear_extension, METH_VARARGS, NULL},
+    {"check_compactunicodeobject_data_alignment", check_compactunicodeobject_data_alignment, METH_NOARGS, NULL},
     {NULL, NULL} /* sentinel */
 };
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 2d50f9c340f2f36..ad7faca47bc5114 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -129,13 +129,16 @@ extern "C" {
 
 #define _PyUnicode_LENGTH(op)                           \
     (_PyASCIIObject_CAST(op)->length)
-#define _PyUnicode_STATE(op)                            \
-    (_PyASCIIObject_CAST(op)->state)
 #define _PyUnicode_HASH(op)                             \
     (_PyASCIIObject_CAST(op)->hash)
+#define _PyUnicode_INTERNED(op)                         \
+    (_PyASCIIObject_CAST(op)->interned)
 #define _PyUnicode_KIND(op)                             \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     _PyASCIIObject_CAST(op)->state.kind)
+    (_PyASCIIObject_CAST(op)->kind)
+#define _PyUnicode_COMPACT(op)                          \
+    (_PyASCIIObject_CAST(op)->compact)
+#define _PyUnicode_ASCII(op)                            \
+    (_PyASCIIObject_CAST(op)->ascii)
 #define _PyUnicode_GET_LENGTH(op)                       \
     (assert(_PyUnicode_CHECK(op)),                      \
      _PyASCIIObject_CAST(op)->length)
@@ -497,21 +500,21 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
     CHECK(PyUnicode_Check(op));
 
     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
-    int kind = ascii->state.kind;
+    int kind = ascii->kind;
 
-    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
+    if (ascii->ascii == 1 && ascii->compact == 1) {
         CHECK(kind == PyUnicode_1BYTE_KIND);
     }
     else {
         PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
         void *data;
 
-        if (ascii->state.compact == 1) {
+        if (ascii->compact == 1) {
             data = compact + 1;
             CHECK(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
-            CHECK(ascii->state.ascii == 0);
+            CHECK(ascii->ascii == 0);
             CHECK(compact->utf8 != data);
         }
         else {
@@ -521,9 +524,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
             CHECK(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
-            CHECK(ascii->state.compact == 0);
+            CHECK(ascii->compact == 0);
             CHECK(data != NULL);
-            if (ascii->state.ascii) {
+            if (ascii->ascii) {
                 CHECK(compact->utf8 == data);
                 CHECK(compact->utf8_length == ascii->length);
             }
@@ -551,7 +554,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
                 maxchar = ch;
         }
         if (kind == PyUnicode_1BYTE_KIND) {
-            if (ascii->state.ascii == 0) {
+            if (ascii->ascii == 0) {
                 CHECK(maxchar >= 128);
                 CHECK(maxchar <= 255);
             }
@@ -1108,9 +1111,9 @@ _PyUnicode_Dump(PyObject *op)
     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
     const void *data;
 
-    if (ascii->state.compact)
+    if (ascii->compact)
     {
-        if (ascii->state.ascii)
+        if (ascii->ascii)
             data = (ascii + 1);
         else
             data = (compact + 1);
@@ -1119,7 +1122,7 @@ _PyUnicode_Dump(PyObject *op)
         data = unicode->data.any;
     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
 
-    if (!ascii->state.ascii) {
+    if (!ascii->ascii) {
         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
     }
     printf(", data=%p\n", data);
@@ -1195,10 +1198,10 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
         data = unicode + 1;
     _PyUnicode_LENGTH(unicode) = size;
     _PyUnicode_HASH(unicode) = -1;
-    _PyUnicode_STATE(unicode).interned = 0;
-    _PyUnicode_STATE(unicode).kind = kind;
-    _PyUnicode_STATE(unicode).compact = 1;
-    _PyUnicode_STATE(unicode).ascii = is_ascii;
+    _PyUnicode_INTERNED(unicode) = 0;
+    _PyUnicode_KIND(unicode) = kind;
+    _PyUnicode_COMPACT(unicode) = 1;
+    _PyUnicode_ASCII(unicode) = is_ascii;
     if (is_ascii) {
         ((char*)data)[size] = 0;
     }
@@ -14372,10 +14375,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
 #else
     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
 #endif
-    _PyUnicode_STATE(self).interned = 0;
-    _PyUnicode_STATE(self).kind = kind;
-    _PyUnicode_STATE(self).compact = 0;
-    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
+    _PyUnicode_INTERNED(self) = 0;
+    _PyUnicode_KIND(self) = kind;
+    _PyUnicode_COMPACT(self) = 0;
+    _PyUnicode_ASCII(self) = _PyUnicode_ASCII(unicode);
     _PyUnicode_UTF8_LENGTH(self) = 0;
     _PyUnicode_UTF8(self) = NULL;
     _PyUnicode_DATA_ANY(self) = NULL;
@@ -14624,7 +14627,7 @@ PyUnicode_InternInPlace(PyObject **p)
        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
        this. */
     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
-    _PyUnicode_STATE(s).interned = 1;
+    _PyUnicode_INTERNED(s) = 1;
 }
 
 // Function kept for the stable ABI.
@@ -14683,7 +14686,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
         total_length += PyUnicode_GET_LENGTH(s);
 #endif
 
-        _PyUnicode_STATE(s).interned = 0;
+        _PyUnicode_INTERNED(s) = 0;
     }
 #ifdef INTERNED_STATS
     fprintf(stderr,
diff --git a/Python/traceback.c b/Python/traceback.c
index 31b85e77575efae..e6417516d47dbf5 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -1092,9 +1092,9 @@ _Py_DumpASCII(int fd, PyObject *text)
         return;
 
     size = ascii->length;
-    kind = ascii->state.kind;
-    if (ascii->state.compact) {
-        if (ascii->state.ascii)
+    kind = ascii->kind;
+    if (ascii->compact) {
+        if (ascii->ascii)
             data = ascii + 1;
         else
             data = _PyCompactUnicodeObject_CAST(text) + 1;
@@ -1114,7 +1114,7 @@ _Py_DumpASCII(int fd, PyObject *text)
     }
 
     // Is an ASCII string?
-    if (ascii->state.ascii) {
+    if (ascii->ascii) {
         assert(kind == PyUnicode_1BYTE_KIND);
         char *str = data;
 
@@ -1341,4 +1341,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,
 
     return NULL;
 }
-
diff --git a/Tools/build/deepfreeze.py b/Tools/build/deepfreeze.py
index 511b26a5ce3dc73..c855e405d781ded 100644
--- a/Tools/build/deepfreeze.py
+++ b/Tools/build/deepfreeze.py
@@ -198,10 +198,9 @@ def generate_unicode(self, name: str, s: str) -> str:
                     self.object_head("PyUnicode_Type")
                     self.write(f".length = {len(s)},")
                     self.write(".hash = -1,")
-                    with self.block(".state =", ","):
-                        self.write(".kind = 1,")
-                        self.write(".compact = 1,")
-                        self.write(".ascii = 1,")
+                    self.write(".kind = 1,")
+                    self.write(".compact = 1,")
+                    self.write(".ascii = 1,")
                 self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
                 return f"& {name}._ascii.ob_base"
             else:
@@ -210,10 +209,9 @@ def generate_unicode(self, name: str, s: str) -> str:
                         self.object_head("PyUnicode_Type")
                         self.write(f".length = {len(s)},")
                         self.write(".hash = -1,")
-                        with self.block(".state =", ","):
-                            self.write(f".kind = {kind},")
-                            self.write(".compact = 1,")
-                            self.write(".ascii = 0,")
+                        self.write(f".kind = {kind},")
+                        self.write(".compact = 1,")
+                        self.write(".ascii = 0,")
                     utf8 = s.encode('utf-8')
                     self.write(f'.utf8 = {make_string_literal(utf8)},')
                     self.write(f'.utf8_length = {len(utf8)},')