diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index b2aae774a122056..64282d0f2d0bcf6 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -2454,10 +2454,6 @@ def setUpClass(cls): # Start fresh. cls.clean_up() - @classmethod - def tearDownClass(cls): - restore__testsinglephase() - def tearDown(self): # Clean up the module. self.clean_up() @@ -3014,20 +3010,20 @@ def test_basic_multiple_interpreters_deleted_no_reset(self): # * alive in 0 interpreters # * module def in _PyRuntime.imports.extensions # * mod init func ran for the first time (since reset) - # * m_copy is NULL (claered when the interpreter was destroyed) + # * m_copy is still set (owned by main interpreter) # * module's global state was initialized, not reset # Use a subinterpreter that sticks around. loaded_interp1 = self.import_in_subinterp(interpid1) self.check_common(loaded_interp1) - self.check_semi_fresh(loaded_interp1, loaded_main, base) + self.check_copied(loaded_interp1, base) # At this point: # * alive in 1 interpreter (interp1) # * module def still in _PyRuntime.imports.extensions - # * mod init func ran for the second time (since reset) - # * m_copy was copied from interp1 (was NULL) - # * module's global state was updated, not reset + # * mod init func did not run again + # * m_copy was not changed + # * module's global state was not touched # Use a subinterpreter while the previous one is still alive. loaded_interp2 = self.import_in_subinterp(interpid2) @@ -3038,8 +3034,8 @@ def test_basic_multiple_interpreters_deleted_no_reset(self): # * alive in 2 interpreters (interp1, interp2) # * module def still in _PyRuntime.imports.extensions # * mod init func did not run again - # * m_copy was copied from interp2 (was from interp1) - # * module's global state was updated, not reset + # * m_copy was not changed + # * module's global state was not touched @requires_subinterpreters def test_basic_multiple_interpreters_reset_each(self): diff --git a/Lib/test/test_interpreters/__init__.py b/Lib/test/test_interpreters/__init__.py index 4b16ecc31156a51..52ff553f60d0d75 100644 --- a/Lib/test/test_interpreters/__init__.py +++ b/Lib/test/test_interpreters/__init__.py @@ -1,5 +1,6 @@ import os -from test.support import load_package_tests +from test.support import load_package_tests, Py_GIL_DISABLED -def load_tests(*args): - return load_package_tests(os.path.dirname(__file__), *args) +if not Py_GIL_DISABLED: + def load_tests(*args): + return load_package_tests(os.path.dirname(__file__), *args) diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-06-10-57-54.gh-issue-117953.DqCzIs.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-06-10-57-54.gh-issue-117953.DqCzIs.rst new file mode 100644 index 000000000000000..6b69c9311013bb8 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-06-10-57-54.gh-issue-117953.DqCzIs.rst @@ -0,0 +1,8 @@ +When a builtin or extension module is imported for the first time, while a +subinterpreter is active, the module's init function is now run by the main +interpreter first before import continues in the subinterpreter. +Consequently, single-phase init modules now fail in an isolated +subinterpreter without the init function running under that interpreter, +whereas before it would run under the subinterpreter *before* failing, +potentially leaving behind global state and callbacks and otherwise leaving +the module in an inconsistent state. diff --git a/Python/import.c b/Python/import.c index 0b58567dde1fec5..ba44477318d4731 100644 --- a/Python/import.c +++ b/Python/import.c @@ -1154,9 +1154,6 @@ init_cached_m_dict(struct extensions_cache_value *value, PyObject *m_dict) return -1; } // XXX We may want to make the copy immortal. - // XXX This incref shouldn't be necessary. We are decref'ing - // one to many times somewhere. - Py_INCREF(copied); value->_m_dict = (struct cached_m_dict){ .copied=copied, @@ -1580,6 +1577,26 @@ _PyImport_CheckGILForModule(PyObject* module, PyObject *module_name) } #endif +static PyThreadState * +switch_to_main_interpreter(PyThreadState *tstate) +{ + if (_Py_IsMainInterpreter(tstate->interp)) { + return tstate; + } + PyThreadState *main_tstate = PyThreadState_New(_PyInterpreterState_Main()); + if (main_tstate == NULL) { + return NULL; + } + main_tstate->_whence = _PyThreadState_WHENCE_EXEC; +#ifndef NDEBUG + PyThreadState *old_tstate = PyThreadState_Swap(main_tstate); + assert(old_tstate == tstate); +#else + (void)PyThreadState_Swap(main_tstate); +#endif + return main_tstate; +} + static PyObject * get_core_module_dict(PyInterpreterState *interp, PyObject *name, PyObject *path) @@ -1936,24 +1953,171 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0, struct extensions_cache_value *cached = NULL; const char *name_buf = PyBytes_AS_STRING(info->name_encoded); + /* We cannot know if the module is single-phase init or + * multi-phase init until after we call its init function. Even + * in isolated interpreters (that do not support single-phase init), + * the init function will run without restriction. For multi-phase + * init modules that isn't a problem because the init function only + * runs PyModuleDef_Init() on the module's def and then returns it. + * + * However, for single-phase init the module's init function will + * create the module, create other objects (and allocate other + * memory), populate it and its module state, and initialze static + * types. Some modules store other objects and data in global C + * variables and register callbacks with the runtime/stdlib or + * even external libraries (which is part of why we can't just + * dlclose() the module in the error case). That's a problem + * for isolated interpreters since all of the above happens + * and only then * will the import fail. Memory will leak, + * callbacks will still get used, and sometimes there + * will be crashes (memory access violations + * and use-after-free). + * + * To put it another way, if the module is single-phase init + * then the import will probably break interpreter isolation + * and should fail ASAP. However, the module's init function + * will still get run. That means it may still store state + * in the shared-object/DLL address space (which never gets + * closed/cleared), including objects (e.g. static types). + * This is a problem for isolated subinterpreters since each + * has its own object allocator. If the loaded shared-object + * still holds a reference to an object after the corresponding + * interpreter has finalized then either we must let it leak + * or else any later use of that object by another interpreter + * (or across multiple init-fini cycles) will crash the process. + * + * To avoid all of that, we make sure the module's init function + * is always run first with the main interpreter active. If it was + * already the main interpreter then we can continue loading the + * module like normal. Otherwise, right after the init function, + * we take care of some import state bookkeeping, switch back + * to the subinterpreter, check for single-phase init, + * and then continue loading like normal. */ + + bool switched = false; + /* We *could* leave in place a legacy interpreter here + * (one that shares obmalloc/GIL with main interp), + * but there isn't a big advantage, we anticipate + * such interpreters will be increasingly uncommon, + * and the code is a bit simpler if we always switch + * to the main interpreter. */ + PyThreadState *main_tstate = switch_to_main_interpreter(tstate); + if (main_tstate == NULL) { + return NULL; + } + else if (main_tstate != tstate) { + switched = true; + /* In the switched case, we could play it safe + * by getting the main interpreter's import lock here. + * It's unlikely to matter though. */ + } + struct _Py_ext_module_loader_result res; - if (_PyImport_RunModInitFunc(p0, info, &res) < 0) { + int rc = _PyImport_RunModInitFunc(p0, info, &res); + if (rc < 0) { /* We discard res.def. */ assert(res.module == NULL); - _Py_ext_module_loader_result_apply_error(&res, name_buf); - return NULL; } - assert(!PyErr_Occurred()); - assert(res.err == NULL); + else { + assert(!PyErr_Occurred()); + assert(res.err == NULL); + + mod = res.module; + res.module = NULL; + def = res.def; + assert(def != NULL); + + /* Do anything else that should be done + * while still using the main interpreter. */ + if (res.kind == _Py_ext_module_kind_SINGLEPHASE) { + /* Remember the filename as the __file__ attribute */ + if (info->filename != NULL) { + // XXX There's a refleak somewhere with the filename. + // Until we can track it down, we intern it. + PyObject *filename = Py_NewRef(info->filename); + PyUnicode_InternInPlace(&filename); + if (PyModule_AddObjectRef(mod, "__file__", filename) < 0) { + PyErr_Clear(); /* Not important enough to report */ + } + } + + /* Update global import state. */ + assert(def->m_base.m_index != 0); + struct singlephase_global_update singlephase = { + // XXX Modules that share a def should each get their own index, + // whereas currently they share (which means the per-interpreter + // cache is less reliable than it should be). + .m_index=def->m_base.m_index, + .origin=info->origin, +#ifdef Py_GIL_DISABLED + .md_gil=((PyModuleObject *)mod)->md_gil, +#endif + }; + // gh-88216: Extensions and def->m_base.m_copy can be updated + // when the extension module doesn't support sub-interpreters. + if (def->m_size == -1) { + /* We will reload from m_copy. */ + assert(def->m_base.m_init == NULL); + singlephase.m_dict = PyModule_GetDict(mod); + assert(singlephase.m_dict != NULL); + } + else { + /* We will reload via the init function. */ + assert(def->m_size >= 0); + assert(def->m_base.m_copy == NULL); + singlephase.m_init = p0; + } + cached = update_global_state_for_extension( + tstate, info->path, info->name, def, &singlephase); + if (cached == NULL) { + assert(PyErr_Occurred()); + goto main_finally; + } + } + } - mod = res.module; - res.module = NULL; - def = res.def; - assert(def != NULL); +main_finally: + /* Switch back to the subinterpreter. */ + if (switched) { + assert(main_tstate != tstate); + + /* Handle any exceptions, which we cannot propagate directly + * to the subinterpreter. */ + if (PyErr_Occurred()) { + if (PyErr_ExceptionMatches(PyExc_MemoryError)) { + /* We trust it will be caught again soon. */ + PyErr_Clear(); + } + else { + /* Printing the exception should be sufficient. */ + PyErr_PrintEx(0); + } + } + + /* Any module we got from the init function will have to be + * reloaded in the subinterpreter. */ + Py_CLEAR(mod); + + PyThreadState_Clear(main_tstate); + (void)PyThreadState_Swap(tstate); + PyThreadState_Delete(main_tstate); + } + + /*****************************************************************/ + /* At this point we are back to the interpreter we started with. */ + /*****************************************************************/ + + /* Finally we handle the error return from _PyImport_RunModInitFunc(). */ + if (rc < 0) { + _Py_ext_module_loader_result_apply_error(&res, name_buf); + goto error; + } if (res.kind == _Py_ext_module_kind_MULTIPHASE) { assert_multiphase_def(def); assert(mod == NULL); + /* Note that we cheat a little by not repeating the calls + * to _PyImport_GetModInitFunc() and _PyImport_RunModInitFunc(). */ mod = PyModule_FromDefAndSpec(def, spec); if (mod == NULL) { goto error; @@ -1962,57 +2126,35 @@ import_run_extension(PyThreadState *tstate, PyModInitFunction p0, else { assert(res.kind == _Py_ext_module_kind_SINGLEPHASE); assert_singlephase_def(def); - assert(PyModule_Check(mod)); if (_PyImport_CheckSubinterpIncompatibleExtensionAllowed(name_buf) < 0) { goto error; } + assert(!PyErr_Occurred()); - /* Remember the filename as the __file__ attribute */ - if (info->filename != NULL) { - if (PyModule_AddObjectRef(mod, "__file__", info->filename) < 0) { - PyErr_Clear(); /* Not important enough to report */ + if (switched) { + /* We switched to the main interpreter to run the init + * function, so now we will "reload" the module from the + * cached data using the original subinterpreter. */ + assert(mod == NULL); + mod = reload_singlephase_extension(tstate, cached, info); + if (mod == NULL) { + goto error; } - } - - /* Update global import state. */ - assert(def->m_base.m_index != 0); - struct singlephase_global_update singlephase = { - // XXX Modules that share a def should each get their own index, - // whereas currently they share (which means the per-interpreter - // cache is less reliable than it should be). - .m_index=def->m_base.m_index, - .origin=info->origin, -#ifdef Py_GIL_DISABLED - .md_gil=((PyModuleObject *)mod)->md_gil, -#endif - }; - // gh-88216: Extensions and def->m_base.m_copy can be updated - // when the extension module doesn't support sub-interpreters. - if (def->m_size == -1) { - /* We will reload from m_copy. */ - assert(def->m_base.m_init == NULL); - singlephase.m_dict = PyModule_GetDict(mod); - assert(singlephase.m_dict != NULL); + assert(!PyErr_Occurred()); + assert(PyModule_Check(mod)); } else { - /* We will reload via the init function. */ - assert(def->m_size >= 0); - assert(def->m_base.m_copy == NULL); - singlephase.m_init = p0; - } - cached = update_global_state_for_extension( - tstate, info->path, info->name, def, &singlephase); - if (cached == NULL) { - goto error; - } - - /* Update per-interpreter import state. */ - PyObject *modules = get_modules_dict(tstate, true); - if (finish_singlephase_extension( - tstate, mod, cached, info->name, modules) < 0) - { - goto error; + assert(mod != NULL); + assert(PyModule_Check(mod)); + + /* Update per-interpreter import state. */ + PyObject *modules = get_modules_dict(tstate, true); + if (finish_singlephase_extension( + tstate, mod, cached, info->name, modules) < 0) + { + goto error; + } } }