Skip to content

Commit

Permalink
pythongh-89188: replace bitfield with struct fields in PyASCIIObject
Browse files Browse the repository at this point in the history
  • Loading branch information
davidhewitt committed Mar 10, 2023
1 parent 12226be commit 14c79d4
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 93 deletions.
91 changes: 40 additions & 51 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,46 +97,41 @@ typedef struct {
PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
struct {
/* If interned is set, the two references from the
dictionary to this object are *not* counted in ob_refcnt. */
unsigned int interned:1;
/* Character size:
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
unsigned int kind:3;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
unsigned int ascii:1;
/* Padding to ensure that PyUnicode_DATA() is always aligned to
4 bytes (see issue #19537 on m68k). */
unsigned int :26;
} state;
/* If interned is set, the two references from the
dictionary to this object are *not* counted in ob_refcnt. */
uint8_t interned;
/* Character size:
- PyUnicode_1BYTE_KIND (1):
* character type = Py_UCS1 (8 bits, unsigned)
* all characters are in the range U+0000-U+00FF (latin1)
* if ascii is set, all characters are in the range U+0000-U+007F
(ASCII), otherwise at least one character is in the range
U+0080-U+00FF
- PyUnicode_2BYTE_KIND (2):
* character type = Py_UCS2 (16 bits, unsigned)
* all characters are in the range U+0000-U+FFFF (BMP)
* at least one character is in the range U+0100-U+FFFF
- PyUnicode_4BYTE_KIND (4):
* character type = Py_UCS4 (32 bits, unsigned)
* all characters are in the range U+0000-U+10FFFF
* at least one character is in the range U+10000-U+10FFFF
*/
uint8_t kind;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
uint8_t compact;
/* The string only contains characters in the range U+0000-U+007F (ASCII)
and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
set, use the PyASCIIObject structure. */
uint8_t ascii;
} PyASCIIObject;

/* Non-ASCII strings allocated through PyUnicode_New use the
Expand Down Expand Up @@ -178,15 +173,9 @@ PyAPI_FUNC(int) _PyUnicode_CheckConsistency(

/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */

/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1

/* Use only if you know it's a string */
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.interned;
return _PyASCIIObject_CAST(op)->interned;
}
#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))

Expand All @@ -200,21 +189,21 @@ static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
ready. */
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.ascii;
return _PyASCIIObject_CAST(op)->ascii;
}
#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))

/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
return _PyASCIIObject_CAST(op)->state.compact;
return _PyASCIIObject_CAST(op)->compact;
}
#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))

/* Return true if the string is a compact ASCII string (use PyASCIIObject
structure), or 0 if not. No type checks or Ready calls are performed. */
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
return (_PyASCIIObject_CAST(op)->ascii && PyUnicode_IS_COMPACT(op));
}
#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))

Expand All @@ -231,7 +220,7 @@ enum PyUnicode_Kind {
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
// unsigned numbers) where kind type is an int or on
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->kind)

/* Return a void pointer to the raw unicode buffer. */
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
Expand Down
8 changes: 3 additions & 5 deletions Include/internal/pycore_runtime_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,9 @@ extern PyTypeObject _PyExc_MemoryError;
.ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \
.length = sizeof(LITERAL) - 1, \
.hash = -1, \
.state = { \
.kind = 1, \
.compact = 1, \
.ascii = (ASCII), \
}, \
.kind = 1, \
.compact = 1, \
.ascii = (ASCII), \
}
#define _PyASCIIObject_INIT(LITERAL) \
{ \
Expand Down
6 changes: 6 additions & 0 deletions Lib/test/test_capi/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1555,5 +1555,11 @@ def func2(x=None):
self.do_test(func2)


# Checks the memory layout that compact unicode objects rely on.
class Test_UnicodeObjectAlignment(unittest.TestCase):

    def test_unicodeobject_data_alignment(self):
        # The C helper sets AssertionError when sizeof(PyCompactUnicodeObject)
        # is not a multiple of 4; that exception propagating out of the call
        # is what fails this test.
        _testinternalcapi.check_compactunicodeobject_data_alignment()


if __name__ == "__main__":
unittest.main()
16 changes: 16 additions & 0 deletions Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,21 @@ clear_extension(PyObject *self, PyObject *args)
Py_RETURN_NONE;
}

/* Check that data stored directly after a PyCompactUnicodeObject header
   starts on a 4-byte boundary.

   Registered as METH_NOARGS, so it uses the two-argument PyCFunction
   signature; both arguments are unused.

   Returns None on success.  On failure, sets AssertionError and returns
   NULL. */
static PyObject *
check_compactunicodeobject_data_alignment(PyObject *Py_UNUSED(self),
                                          PyObject *Py_UNUSED(args))
{
    size_t data_offset = sizeof(PyCompactUnicodeObject);
    if (data_offset % 4 != 0) {
        // This is required so that the data (which immediately follows a
        // compact unicode object) is correctly aligned in the largest case
        // (UCS4).
        // %zu is the conversion that matches a size_t argument.
        PyErr_Format(PyExc_AssertionError,
                     "PyCompactUnicodeObject size offset is %zu, "
                     "needs to be multiple of 4 bytes",
                     data_offset);
        return NULL;
    }
    Py_RETURN_NONE;
}


static PyMethodDef module_functions[] = {
{"get_configs", get_configs, METH_NOARGS},
Expand All @@ -707,6 +722,7 @@ static PyMethodDef module_functions[] = {
_TESTINTERNALCAPI_OPTIMIZE_CFG_METHODDEF
{"get_interp_settings", get_interp_settings, METH_VARARGS, NULL},
{"clear_extension", clear_extension, METH_VARARGS, NULL},
{"check_compactunicodeobject_data_alignment", check_compactunicodeobject_data_alignment, METH_NOARGS, NULL},
{NULL, NULL} /* sentinel */
};

Expand Down
51 changes: 27 additions & 24 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,16 @@ extern "C" {

#define _PyUnicode_LENGTH(op) \
(_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op) \
(_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \
(_PyASCIIObject_CAST(op)->hash)
/* Expands to the `interned` field of the object viewed as a PyASCIIObject.
   The expansion is the field itself, so it can be read or assigned to.
   No type check is performed on op. */
#define _PyUnicode_INTERNED(op) \
(_PyASCIIObject_CAST(op)->interned)
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->state.kind)
(_PyASCIIObject_CAST(op)->kind)
/* Expands to the `compact` field of the object viewed as a PyASCIIObject.
   The expansion is the field itself, so it can be read or assigned to.
   No type check is performed on op. */
#define _PyUnicode_COMPACT(op) \
(_PyASCIIObject_CAST(op)->compact)
/* Expands to the `ascii` field of the object viewed as a PyASCIIObject.
   The expansion is the field itself, so it can be read or assigned to.
   No type check is performed on op. */
#define _PyUnicode_ASCII(op) \
(_PyASCIIObject_CAST(op)->ascii)
#define _PyUnicode_GET_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
_PyASCIIObject_CAST(op)->length)
Expand Down Expand Up @@ -497,21 +500,21 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(PyUnicode_Check(op));

PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
int kind = ascii->state.kind;
int kind = ascii->kind;

if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
if (ascii->ascii == 1 && ascii->compact == 1) {
CHECK(kind == PyUnicode_1BYTE_KIND);
}
else {
PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
void *data;

if (ascii->state.compact == 1) {
if (ascii->compact == 1) {
data = compact + 1;
CHECK(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.ascii == 0);
CHECK(ascii->ascii == 0);
CHECK(compact->utf8 != data);
}
else {
Expand All @@ -521,9 +524,9 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(kind == PyUnicode_1BYTE_KIND
|| kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.compact == 0);
CHECK(ascii->compact == 0);
CHECK(data != NULL);
if (ascii->state.ascii) {
if (ascii->ascii) {
CHECK(compact->utf8 == data);
CHECK(compact->utf8_length == ascii->length);
}
Expand Down Expand Up @@ -551,7 +554,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
maxchar = ch;
}
if (kind == PyUnicode_1BYTE_KIND) {
if (ascii->state.ascii == 0) {
if (ascii->ascii == 0) {
CHECK(maxchar >= 128);
CHECK(maxchar <= 255);
}
Expand Down Expand Up @@ -1108,9 +1111,9 @@ _PyUnicode_Dump(PyObject *op)
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
const void *data;

if (ascii->state.compact)
if (ascii->compact)
{
if (ascii->state.ascii)
if (ascii->ascii)
data = (ascii + 1);
else
data = (compact + 1);
Expand All @@ -1119,7 +1122,7 @@ _PyUnicode_Dump(PyObject *op)
data = unicode->data.any;
printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);

if (!ascii->state.ascii) {
if (!ascii->ascii) {
printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
}
printf(", data=%p\n", data);
Expand Down Expand Up @@ -1195,10 +1198,10 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
data = unicode + 1;
_PyUnicode_LENGTH(unicode) = size;
_PyUnicode_HASH(unicode) = -1;
_PyUnicode_STATE(unicode).interned = 0;
_PyUnicode_STATE(unicode).kind = kind;
_PyUnicode_STATE(unicode).compact = 1;
_PyUnicode_STATE(unicode).ascii = is_ascii;
_PyUnicode_INTERNED(unicode) = 0;
_PyUnicode_KIND(unicode) = kind;
_PyUnicode_COMPACT(unicode) = 1;
_PyUnicode_ASCII(unicode) = is_ascii;
if (is_ascii) {
((char*)data)[size] = 0;
}
Expand Down Expand Up @@ -14372,10 +14375,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
#else
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
_PyUnicode_STATE(self).interned = 0;
_PyUnicode_STATE(self).kind = kind;
_PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_INTERNED(self) = 0;
_PyUnicode_KIND(self) = kind;
_PyUnicode_COMPACT(self) = 0;
_PyUnicode_ASCII(self) = _PyUnicode_ASCII(unicode);
_PyUnicode_UTF8_LENGTH(self) = 0;
_PyUnicode_UTF8(self) = NULL;
_PyUnicode_DATA_ANY(self) = NULL;
Expand Down Expand Up @@ -14624,7 +14627,7 @@ PyUnicode_InternInPlace(PyObject **p)
refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
this. */
Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
_PyUnicode_STATE(s).interned = 1;
_PyUnicode_INTERNED(s) = 1;
}

// Function kept for the stable ABI.
Expand Down Expand Up @@ -14683,7 +14686,7 @@ _PyUnicode_ClearInterned(PyInterpreterState *interp)
total_length += PyUnicode_GET_LENGTH(s);
#endif

_PyUnicode_STATE(s).interned = 0;
_PyUnicode_INTERNED(s) = 0;
}
#ifdef INTERNED_STATS
fprintf(stderr,
Expand Down
9 changes: 4 additions & 5 deletions Python/traceback.c
Original file line number Diff line number Diff line change
Expand Up @@ -1092,9 +1092,9 @@ _Py_DumpASCII(int fd, PyObject *text)
return;

size = ascii->length;
kind = ascii->state.kind;
if (ascii->state.compact) {
if (ascii->state.ascii)
kind = ascii->kind;
if (ascii->compact) {
if (ascii->ascii)
data = ascii + 1;
else
data = _PyCompactUnicodeObject_CAST(text) + 1;
Expand All @@ -1114,7 +1114,7 @@ _Py_DumpASCII(int fd, PyObject *text)
}

// Is an ASCII string?
if (ascii->state.ascii) {
if (ascii->ascii) {
assert(kind == PyUnicode_1BYTE_KIND);
char *str = data;

Expand Down Expand Up @@ -1341,4 +1341,3 @@ _Py_DumpTracebackThreads(int fd, PyInterpreterState *interp,

return NULL;
}

14 changes: 6 additions & 8 deletions Tools/build/deepfreeze.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,9 @@ def generate_unicode(self, name: str, s: str) -> str:
self.object_head("PyUnicode_Type")
self.write(f".length = {len(s)},")
self.write(".hash = -1,")
with self.block(".state =", ","):
self.write(".kind = 1,")
self.write(".compact = 1,")
self.write(".ascii = 1,")
self.write(".kind = 1,")
self.write(".compact = 1,")
self.write(".ascii = 1,")
self.write(f"._data = {make_string_literal(s.encode('ascii'))},")
return f"& {name}._ascii.ob_base"
else:
Expand All @@ -210,10 +209,9 @@ def generate_unicode(self, name: str, s: str) -> str:
self.object_head("PyUnicode_Type")
self.write(f".length = {len(s)},")
self.write(".hash = -1,")
with self.block(".state =", ","):
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
self.write(f".kind = {kind},")
self.write(".compact = 1,")
self.write(".ascii = 0,")
utf8 = s.encode('utf-8')
self.write(f'.utf8 = {make_string_literal(utf8)},')
self.write(f'.utf8_length = {len(utf8)},')
Expand Down

0 comments on commit 14c79d4

Please sign in to comment.