Skip to content

Commit

Permalink
unicode: make unicodeobject.c thread-safe
Browse files Browse the repository at this point in the history
 - Use atomic operations to initialize _Py_Identifiers
 - Create interpreter interned dict at interpreter startup
  • Loading branch information
colesbury committed Apr 23, 2023
1 parent 410ba10 commit 6540bf3
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 104 deletions.
7 changes: 3 additions & 4 deletions Include/cpython/object.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,15 @@ PyAPI_FUNC(Py_ssize_t) _Py_GetRefTotal(void);
*/
typedef struct _Py_Identifier {
const char* string;
// Index in PyInterpreterState.unicode.ids.array. It is process-wide
// unique and must be initialized to -1.
Py_ssize_t index;
PyObject *obj;
struct _Py_Identifier *next;
} _Py_Identifier;

#ifndef Py_BUILD_CORE
// For now we are keeping _Py_IDENTIFIER for continued use
// in non-builtin extensions (and naughty PyPI modules).

#define _Py_static_string_init(value) { .string = (value), .index = -1 }
#define _Py_static_string_init(value) { .string = (value), .obj = NULL, .next = NULL }
#define _Py_static_string(varname, value) static _Py_Identifier varname = _Py_static_string_init(value)
#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)

Expand Down
5 changes: 5 additions & 0 deletions Include/internal/pycore_qsbr.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ struct qsbr_pad {
char __padding[64 - sizeof(struct qsbr)];
};

// Node carrying a self-referential `next` link and a 64-bit `seq` value.
// NOTE(review): no user of this struct is visible in this header excerpt;
// the field notes below are inferred from names and types only -- confirm
// against the code that queues and processes these nodes.
struct _Py_qsbr_head {
// Link to another _Py_qsbr_head, so nodes can be chained into a list.
struct _Py_qsbr_head *next;
// NOTE(review): presumably the QSBR sequence number associated with this
// node (same uint64_t width as the value returned by
// _Py_qsbr_shared_current()) -- TODO confirm.
uint64_t seq;
};

static inline uint64_t
_Py_qsbr_shared_current(struct qsbr_shared *shared)
{
Expand Down
18 changes: 2 additions & 16 deletions Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,9 @@ extern PyTypeObject _PyUnicodeASCIIIter_Type;

/* other API */

struct _Py_unicode_runtime_ids {
_PyMutex mutex;
// next_index value must be preserved when Py_Initialize()/Py_Finalize()
// is called multiple times: see _PyUnicode_FromId() implementation.
Py_ssize_t next_index;
};

struct _Py_unicode_runtime_state {
struct _Py_unicode_runtime_ids ids;
// linked list of initialized _Py_Identifiers
_Py_Identifier *head;
};

/* fs_codec.encoding is initialized to NULL.
Expand All @@ -45,18 +39,10 @@ struct _Py_unicode_fs_codec {
_Py_error_handler error_handler;
};

struct _Py_unicode_ids {
Py_ssize_t size;
PyObject **array;
};

struct _Py_unicode_state {
struct _Py_unicode_fs_codec fs_codec;

_PyUnicode_Name_CAPI *ucnhash_capi;

// Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
struct _Py_unicode_ids ids;
};

extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);
Expand Down
121 changes: 46 additions & 75 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,7 @@ static inline PyObject* unicode_get_empty(void)
// Return a strong reference to the empty string singleton.
static inline PyObject* unicode_new_empty(void)
{
PyObject *empty = unicode_get_empty();
return Py_NewRef(empty);
return unicode_get_empty();
}

/* This dictionary holds all interned unicode strings. Note that references
Expand Down Expand Up @@ -1706,7 +1705,7 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
static PyObject*
get_latin1_char(Py_UCS1 ch)
{
return Py_NewRef(LATIN1(ch));
return LATIN1(ch);
}

static PyObject*
Expand Down Expand Up @@ -1863,67 +1862,41 @@ resize_array(PyObject **array, Py_ssize_t *capacity)
return new_array;
}

PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
static PyObject *
initialize_identifier(_Py_Identifier *id)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
struct _Py_unicode_ids *ids = &interp->unicode.ids;

Py_ssize_t index = _Py_atomic_size_get(&id->index);
if (index < 0) {
struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;

_PyMutex_lock(&rt_ids->mutex);
// Check again to detect concurrent access. Another thread can have
// initialized the index while this thread waited for the lock.
index = _Py_atomic_size_get(&id->index);
if (index < 0) {
assert(rt_ids->next_index < PY_SSIZE_T_MAX);
index = rt_ids->next_index;
rt_ids->next_index++;
_Py_atomic_size_set(&id->index, index);
}
_PyMutex_unlock(&rt_ids->mutex);
}
assert(index >= 0);

PyObject *obj;
if (index < ids->size) {
obj = ids->array[index];
if (obj) {
// Return a borrowed reference
return obj;
}
}

obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
NULL, NULL);
PyObject *obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
NULL, NULL);
if (!obj) {
return NULL;
}
PyUnicode_InternInPlace(&obj);

if (index >= ids->size) {
// Overallocate to reduce the number of realloc
Py_ssize_t new_size = Py_MAX(index * 2, 16);
Py_ssize_t item_size = sizeof(ids->array[0]);
PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
if (new_array == NULL) {
PyErr_NoMemory();
return NULL;
assert(_PyObject_IS_IMMORTAL(obj));

if (!_Py_atomic_compare_exchange_ptr(&id->obj, NULL, obj)) {
Py_DECREF(obj);
return _Py_atomic_load_ptr(&id->obj);
}
for (;;) {
id->next = _Py_atomic_load_ptr(&_PyRuntime.unicode_state.head);
if (_Py_atomic_compare_exchange_ptr(&_PyRuntime.unicode_state.head, id->next, id)) {
break;
}
memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
ids->array = new_array;
ids->size = new_size;
}

// The array stores a strong reference
ids->array[index] = obj;

// Return a borrowed reference
return obj;
}

/* Return the string object associated with a static identifier.
 *
 * The identifier's cached object pointer is read with an atomic load. When
 * it is already set, that pointer is returned directly. Otherwise the call
 * is forwarded to initialize_identifier() and its result is returned
 * unchanged (which may be NULL if initialization fails).
 */
PyObject *
_PyUnicode_FromId(_Py_Identifier *id)
{
    PyObject *cached = _Py_atomic_load_ptr(&id->obj);
    if (cached == NULL) {
        // Nothing cached yet: take the slow path.
        return initialize_identifier(id);
    }
    return cached;
}

static void
_PyUnicode_Immortalize(PyObject *obj)
{
Expand Down Expand Up @@ -1956,17 +1929,16 @@ _PyUnicode_Immortalize(PyObject *obj)


static void
unicode_clear_identifiers(struct _Py_unicode_state *state)
unicode_clear_identifiers(struct _Py_unicode_runtime_state *state)
{
struct _Py_unicode_ids *ids = &state->ids;
for (Py_ssize_t i=0; i < ids->size; i++) {
Py_XDECREF(ids->array[i]);
_Py_Identifier *id = state->head;
while (id) {
_Py_Identifier *next = id->next;
id->next = NULL;
id->obj = NULL;
id = next;
}
ids->size = 0;
PyMem_Free(ids->array);
ids->array = NULL;
// Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
// after Py_Finalize().
state->head = NULL;
}

static void
Expand Down Expand Up @@ -14593,6 +14565,16 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
return _PyStatus_OK();
}

/* Create the interned dictionary. This must be done before creating static
* strings.
*/
assert(get_interned_dict() == NULL);
PyObject *dict = PyDict_New();
if (!dict) {
return _PyStatus_NO_MEMORY();
}
set_interned_dict(dict);

/* Intern statically allocated string identifiers and deepfreeze strings.
* This must be done before any module initialization so that statically
* allocated string identifiers are used instead of heap allocated strings.
Expand Down Expand Up @@ -14660,14 +14642,6 @@ PyUnicode_InternInPlace(PyObject **p)
}

PyObject *interned = get_interned_dict();
if (interned == NULL) {
interned = PyDict_New();
if (interned == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
set_interned_dict(interned);
}

if (!_Py_ThreadLocal(s) && !_PyObject_IS_IMMORTAL(s)) {
/* Make a copy so that we can safely immortalize the string. */
Expand Down Expand Up @@ -15189,21 +15163,18 @@ _PyUnicode_FiniTypes(PyInterpreterState *interp)
void
_PyUnicode_Fini(PyInterpreterState *interp)
{
struct _Py_unicode_state *state = &interp->unicode;

if (_Py_IsMainInterpreter(interp)) {
// _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
assert(get_interned_dict() == NULL);
// bpo-47182: force a unicodedata CAPI capsule re-import on
// subsequent initialization of main interpreter.
}

_PyUnicode_FiniEncodings(&state->fs_codec);
_PyUnicode_FiniEncodings(&interp->unicode.fs_codec);
interp->unicode.ucnhash_capi = NULL;

unicode_clear_identifiers(state);

if (_Py_IsMainInterpreter(interp)) {
unicode_clear_identifiers(&_PyRuntime.unicode_state);
unicode_free_immortalized(&_PyRuntime);
}
}
Expand Down
1 change: 0 additions & 1 deletion Programs/_testembed.c
Original file line number Diff line number Diff line change
Expand Up @@ -1896,7 +1896,6 @@ static int test_unicode_id_init(void)
// is defined, it is manually expanded here.
static _Py_Identifier PyId_test_unicode_id_init = {
.string = "test_unicode_id_init",
.index = -1,
};

// Initialize Python once without using the identifier
Expand Down
10 changes: 2 additions & 8 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ Py_DECL_THREAD PyThreadState *_Py_current_tstate;
static void
init_runtime(_PyRuntimeState *runtime,
void *open_code_hook, void *open_code_userdata,
_Py_AuditHookEntry *audit_hook_head,
Py_ssize_t unicode_next_index)
_Py_AuditHookEntry *audit_hook_head)
{
if (runtime->_initialized) {
Py_FatalError("runtime already initialized");
Expand All @@ -95,9 +94,6 @@ init_runtime(_PyRuntimeState *runtime,

// Set it to the ID of the main thread of the main interpreter.
runtime->main_thread = PyThread_get_thread_ident();

runtime->unicode_state.ids.next_index = unicode_next_index;

runtime->_initialized = 1;
}

Expand All @@ -112,15 +108,13 @@ _PyRuntimeState_Init(_PyRuntimeState *runtime)
_Py_AuditHookEntry *audit_hook_head = runtime->audit_hook_head;
// bpo-42882: Preserve next_index value if Py_Initialize()/Py_Finalize()
// is called multiple times.
Py_ssize_t unicode_next_index = runtime->unicode_state.ids.next_index;

if (runtime->_initialized) {
// Py_Initialize() must be running again.
// Reset to _PyRuntimeState_INIT.
memcpy(runtime, &initial, sizeof(*runtime));
}
init_runtime(runtime, open_code_hook, open_code_userdata, audit_hook_head,
unicode_next_index);
init_runtime(runtime, open_code_hook, open_code_userdata, audit_hook_head);

return _PyStatus_OK();
}
Expand Down

0 comments on commit 6540bf3

Please sign in to comment.