Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-116738: Make _codecs module thread-safe #117530

Merged
merged 7 commits into from
May 2, 2024
Merged
31 changes: 31 additions & 0 deletions Include/internal/pycore_codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif

#include "pycore_lock.h" // PyMutex

/* Initialize codecs-related state for the given interpreter, including
registering the first codec search function. Must be called before any other
PyCodec-related functions, and while only one thread is active. */
extern PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp);

/* Finalize codecs-related state for the given interpreter. No PyCodec-related
functions other than PyCodec_Unregister() may be called after this. */
extern void _PyCodec_Fini(PyInterpreterState *interp);

extern PyObject* _PyCodec_Lookup(const char *encoding);

/* Text codec specific encoding and decoding API.
Expand Down Expand Up @@ -48,6 +59,26 @@ extern PyObject* _PyCodecInfo_GetIncrementalEncoder(
PyObject *codec_info,
const char *errors);

// Per-interpreter state used by codecs.c.
struct codecs_state {
// A list of callable objects used to search for codecs.
PyObject *search_path;

// A dict mapping codec names to codecs returned from a callable in
// search_path.
PyObject *search_cache;

// A dict mapping error handling strategies to functions to implement them.
PyObject *error_registry;

#ifdef Py_GIL_DISABLED
// Used to safely delete a specific item from search_path.
PyMutex search_path_mutex;
#endif

// Whether or not the rest of the state is initialized.
int initialized;
};

#ifdef __cplusplus
}
Expand Down
6 changes: 2 additions & 4 deletions Include/internal/pycore_interp.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ extern "C" {
#include "pycore_atexit.h" // struct atexit_state
#include "pycore_ceval_state.h" // struct _ceval_state
#include "pycore_code.h" // struct callable_cache
#include "pycore_codecs.h" // struct codecs_state
#include "pycore_context.h" // struct _Py_context_state
#include "pycore_crossinterp.h" // struct _xidregistry
#include "pycore_dict_state.h" // struct _Py_dict_state
Expand Down Expand Up @@ -182,10 +183,7 @@ struct _is {
possible to facilitate out-of-process observability
tools. */

PyObject *codec_search_path;
PyObject *codec_search_cache;
PyObject *codec_error_registry;
int codecs_initialized;
struct codecs_state codecs;

PyConfig config;
unsigned long feature_flags;
Expand Down
6 changes: 5 additions & 1 deletion Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -15441,7 +15441,11 @@ init_fs_encoding(PyThreadState *tstate)
PyStatus
_PyUnicode_InitEncodings(PyThreadState *tstate)
{
PyStatus status = init_fs_encoding(tstate);
PyStatus status = _PyCodec_InitRegistry(tstate->interp);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
status = init_fs_encoding(tstate);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
Expand Down
150 changes: 80 additions & 70 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Copyright (c) Corporation for National Research Initiatives.
#include "Python.h"
#include "pycore_call.h" // _PyObject_CallNoArgs()
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
#include "pycore_lock.h" // PyMutex
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
swtaarrs marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -19,24 +20,10 @@ const char *Py_hexdigits = "0123456789abcdef";

/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
codec search function.

This is done in a lazy way so that the Unicode implementation does
not downgrade startup time of scripts not needing it.

ImportErrors are silently ignored by this function. Only one try is
made.

*/

static int _PyCodecRegistry_Init(void); /* Forward */

int PyCodec_Register(PyObject *search_function)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
goto onError;
assert(interp->codecs.initialized);
if (search_function == NULL) {
PyErr_BadArgument();
goto onError;
Expand All @@ -45,7 +32,14 @@ int PyCodec_Register(PyObject *search_function)
PyErr_SetString(PyExc_TypeError, "argument must be callable");
goto onError;
}
return PyList_Append(interp->codec_search_path, search_function);
#ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
int ret = PyList_Append(interp->codecs.search_path, search_function);
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
return ret;

onError:
return -1;
Expand All @@ -55,22 +49,34 @@ int
PyCodec_Unregister(PyObject *search_function)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
PyObject *codec_search_path = interp->codec_search_path;
/* Do nothing if codec_search_path is not created yet or was cleared. */
if (codec_search_path == NULL) {
if (interp->codecs.initialized != 1) {
/* Do nothing if codecs state was cleared (only possible during
interpreter shutdown). */
return 0;
}

PyObject *codec_search_path = interp->codecs.search_path;
assert(PyList_CheckExact(codec_search_path));
Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
for (Py_ssize_t i = 0; i < n; i++) {
PyObject *item = PyList_GET_ITEM(codec_search_path, i);
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
#ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
PyObject *item = PyList_GetItemRef(codec_search_path, i);
int ret = 1;
if (item == search_function) {
if (interp->codec_search_cache != NULL) {
assert(PyDict_CheckExact(interp->codec_search_cache));
PyDict_Clear(interp->codec_search_cache);
}
return PyList_SetSlice(codec_search_path, i, i+1, NULL);
// We hold a reference to the item, so its destructor can't run
// while we hold search_path_mutex.
ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
}
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
Py_DECREF(item);
if (ret != 1) {
assert(interp->codecs.search_cache != NULL);
assert(PyDict_CheckExact(interp->codecs.search_cache));
PyDict_Clear(interp->codecs.search_cache);
return ret;
}
}
return 0;
Expand Down Expand Up @@ -132,9 +138,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}

PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
return NULL;
}
assert(interp->codecs.initialized);

/* Convert the encoding to a normalized Python string: all
characters are converted to lower case, spaces and hyphens are
Expand All @@ -147,7 +151,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)

/* First, try to lookup the name in the registry dictionary */
PyObject *result;
if (PyDict_GetItemRef(interp->codec_search_cache, v, &result) < 0) {
if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
goto onError;
}
if (result != NULL) {
Expand All @@ -156,7 +160,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}

/* Next, scan the search functions in order of registration */
const Py_ssize_t len = PyList_Size(interp->codec_search_path);
const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
if (len < 0)
goto onError;
if (len == 0) {
Expand All @@ -170,14 +174,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
for (i = 0; i < len; i++) {
PyObject *func;

func = PyList_GetItem(interp->codec_search_path, i);
func = PyList_GetItemRef(interp->codecs.search_path, i);
if (func == NULL)
goto onError;
result = PyObject_CallOneArg(func, v);
Py_DECREF(func);
if (result == NULL)
goto onError;
if (result == Py_None) {
Py_DECREF(result);
Py_CLEAR(result);
continue;
}
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
Expand All @@ -188,15 +193,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}
break;
}
if (i == len) {
if (result == NULL) {
/* XXX Perhaps we should cache misses too ? */
PyErr_Format(PyExc_LookupError,
"unknown encoding: %s", encoding);
goto onError;
}

/* Cache and return the result */
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
Py_DECREF(result);
goto onError;
}
Expand Down Expand Up @@ -600,13 +605,12 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
int PyCodec_RegisterError(const char *name, PyObject *error)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return -1;
assert(interp->codecs.initialized);
if (!PyCallable_Check(error)) {
PyErr_SetString(PyExc_TypeError, "handler must be callable");
return -1;
}
return PyDict_SetItemString(interp->codec_error_registry,
return PyDict_SetItemString(interp->codecs.error_registry,
name, error);
}

Expand All @@ -616,13 +620,12 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
PyObject *PyCodec_LookupError(const char *name)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return NULL;
assert(interp->codecs.initialized);

if (name==NULL)
name = "strict";
PyObject *handler;
if (PyDict_GetItemStringRef(interp->codec_error_registry, name, &handler) < 0) {
if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
return NULL;
}
if (handler == NULL) {
Expand Down Expand Up @@ -1375,7 +1378,8 @@ static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
return PyCodec_SurrogateEscapeErrors(exc);
}

static int _PyCodecRegistry_Init(void)
PyStatus
_PyCodec_InitRegistry(PyInterpreterState *interp)
{
static struct {
const char *name;
Expand Down Expand Up @@ -1463,45 +1467,51 @@ static int _PyCodecRegistry_Init(void)
}
};

PyInterpreterState *interp = _PyInterpreterState_GET();
PyObject *mod;

if (interp->codec_search_path != NULL)
return 0;

interp->codec_search_path = PyList_New(0);
if (interp->codec_search_path == NULL) {
return -1;
assert(interp->codecs.initialized == 0);
interp->codecs.search_path = PyList_New(0);
if (interp->codecs.search_path == NULL) {
return PyStatus_NoMemory();
}

interp->codec_search_cache = PyDict_New();
if (interp->codec_search_cache == NULL) {
return -1;
interp->codecs.search_cache = PyDict_New();
if (interp->codecs.search_cache == NULL) {
return PyStatus_NoMemory();
}

interp->codec_error_registry = PyDict_New();
if (interp->codec_error_registry == NULL) {
return -1;
interp->codecs.error_registry = PyDict_New();
if (interp->codecs.error_registry == NULL) {
return PyStatus_NoMemory();
}

for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
if (!func) {
return -1;
if (func == NULL) {
return PyStatus_NoMemory();
}

int res = PyCodec_RegisterError(methods[i].name, func);
int res = PyDict_SetItemString(interp->codecs.error_registry,
methods[i].name, func);
Py_DECREF(func);
if (res) {
return -1;
if (res < 0) {
return PyStatus_Error("Failed to insert into codec error registry");
}
}

mod = PyImport_ImportModule("encodings");
interp->codecs.initialized = 1;

// Importing `encodings' will call back into this module to register codec
// search functions, so this is done after everything else is initialized.
PyObject *mod = PyImport_ImportModule("encodings");
if (mod == NULL) {
return -1;
return PyStatus_Error("Failed to import encodings module");
}
Py_DECREF(mod);
interp->codecs_initialized = 1;
return 0;

return PyStatus_Ok();
}

void
_PyCodec_Fini(PyInterpreterState *interp)
{
Py_CLEAR(interp->codecs.search_path);
Py_CLEAR(interp->codecs.search_cache);
Py_CLEAR(interp->codecs.error_registry);
interp->codecs.initialized = 0;
}
4 changes: 1 addition & 3 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -843,9 +843,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
}

PyConfig_Clear(&interp->config);
Py_CLEAR(interp->codec_search_path);
Py_CLEAR(interp->codec_search_cache);
Py_CLEAR(interp->codec_error_registry);
_PyCodec_Fini(interp);

assert(interp->imports.modules == NULL);
assert(interp->imports.modules_by_index == NULL);
Expand Down
2 changes: 1 addition & 1 deletion Tools/c-analyzer/cpython/ignored.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ Python/ceval.c - _PyEval_BinaryOps -
Python/ceval.c - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS -
Python/codecs.c - Py_hexdigits -
Python/codecs.c - ucnhash_capi -
Python/codecs.c _PyCodecRegistry_Init methods -
Python/codecs.c _PyCodec_InitRegistry methods -
Python/compile.c - NO_LABEL -
Python/compile.c - NO_LOCATION -
Python/dynload_shlib.c - _PyImport_DynLoadFiletab -
Expand Down
Loading