Skip to content

bpo-42208: Add _Py_GetLocaleEncoding() #23050

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Include/internal/pycore_fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(

PyAPI_FUNC(void) _Py_closerange(int first, int last);

PyAPI_FUNC(PyObject*) _Py_GetLocaleEncoding(void);

#ifdef __cplusplus
}
#endif
Expand Down
25 changes: 0 additions & 25 deletions Modules/_io/_iomodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -593,31 +593,6 @@ _PyIO_get_module_state(void)
return state;
}

/* Return a new strong reference to the "_bootlocale" module.

   The module is cached in state->locale_module through a weak reference.
   When the cached referent is still alive it is returned directly;
   otherwise the module is imported again and the cache is refreshed.

   Return NULL if the import or the creation of the weak reference fails. */
PyObject *
_PyIO_get_locale_module(_PyIO_State *state)
{
    PyObject *cached_ref = state->locale_module;
    if (cached_ref != NULL) {
        assert(PyWeakref_CheckRef(cached_ref));
        PyObject *alive = PyWeakref_GET_OBJECT(cached_ref);
        if (alive != Py_None) {
            /* Cache hit: hand out a new reference to the live module. */
            Py_INCREF(alive);
            return alive;
        }
        /* The referent is gone: drop the stale weak reference. */
        Py_CLEAR(state->locale_module);
    }

    PyObject *module = PyImport_ImportModule("_bootlocale");
    if (module == NULL) {
        return NULL;
    }

    /* Remember the module weakly so the cache does not keep it alive. */
    PyObject *weak = PyWeakref_NewRef(module, NULL);
    state->locale_module = weak;
    if (weak == NULL) {
        Py_DECREF(module);
        return NULL;
    }
    return module;
}


static int
iomodule_traverse(PyObject *mod, visitproc visit, void *arg) {
_PyIO_State *state = get_io_state(mod);
Expand Down
1 change: 0 additions & 1 deletion Modules/_io/_iomodule.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ typedef struct {
#define IO_STATE() _PyIO_get_module_state()

extern _PyIO_State *_PyIO_get_module_state(void);
extern PyObject *_PyIO_get_locale_module(_PyIO_State *);

#ifdef MS_WINDOWS
extern char _PyIO_get_console_type(PyObject *);
Expand Down
26 changes: 4 additions & 22 deletions Modules/_io/textio.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "Python.h"
#include "pycore_interp.h" // PyInterpreterState.fs_codec
#include "pycore_long.h" // _PyLong_GetZero()
#include "pycore_fileutils.h" // _Py_GetLocaleEncoding()
#include "pycore_object.h"
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "structmember.h" // PyMemberDef
Expand All @@ -27,7 +28,6 @@ _Py_IDENTIFIER(_dealloc_warn);
_Py_IDENTIFIER(decode);
_Py_IDENTIFIER(fileno);
_Py_IDENTIFIER(flush);
_Py_IDENTIFIER(getpreferredencoding);
_Py_IDENTIFIER(isatty);
_Py_IDENTIFIER(mode);
_Py_IDENTIFIER(name);
Expand Down Expand Up @@ -1155,29 +1155,11 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
}
}
if (encoding == NULL && self->encoding == NULL) {
PyObject *locale_module = _PyIO_get_locale_module(state);
if (locale_module == NULL)
goto catch_ImportError;
self->encoding = _PyObject_CallMethodIdOneArg(
locale_module, &PyId_getpreferredencoding, Py_False);
Py_DECREF(locale_module);
self->encoding = _Py_GetLocaleEncoding();
if (self->encoding == NULL) {
catch_ImportError:
/*
Importing locale can raise an ImportError because of
_functools, and locale.getpreferredencoding can raise an
ImportError if _locale is not available. These will happen
during module building.
*/
if (PyErr_ExceptionMatches(PyExc_ImportError)) {
PyErr_Clear();
self->encoding = PyUnicode_FromString("ascii");
}
else
goto error;
goto error;
}
else if (!PyUnicode_Check(self->encoding))
Py_CLEAR(self->encoding);
assert(PyUnicode_Check(self->encoding));
}
if (self->encoding != NULL) {
encoding = PyUnicode_AsUTF8(self->encoding);
Expand Down
43 changes: 42 additions & 1 deletion Python/fileutils.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "Python.h"
#include "pycore_fileutils.h"
#include "pycore_fileutils.h" // fileutils definitions
#include "pycore_runtime.h" // _PyRuntime
#include "osdefs.h" // SEP
#include <locale.h>

Expand Down Expand Up @@ -820,6 +821,46 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
}


// Return the name of the current locale encoding as a new str object:
// the C-level counterpart of locale.getpreferredencoding(False).
//
// Selection order:
//   1. "UTF-8" when the build forces a UTF-8 locale (_Py_FORCE_UTF8_LOCALE).
//   2. "UTF-8" when the Python UTF-8 Mode is enabled in the preconfig.
//   3. "cp<ANSI code page>" on Windows.
//   4. nl_langinfo(CODESET) everywhere else.
//
// On failure, return NULL. When nl_langinfo(CODESET) yields no name and the
// build has no forced UTF-8 fallback, a ValueError is set.
//
// See also config_get_locale_encoding()
PyObject *
_Py_GetLocaleEncoding(void)
{
#ifdef _Py_FORCE_UTF8_LOCALE
    // Android lacks langinfo.h and CODESET; mbstowcs() and wcstombs()
    // always use UTF-8 there.
    return PyUnicode_FromString("UTF-8");
#else
    if (_PyRuntime.preconfig.utf8_mode) {
        return PyUnicode_FromString("UTF-8");
    }

#  ifdef MS_WINDOWS
    return PyUnicode_FromFormat("cp%u", GetACP());
#  else
    const char *codeset = nl_langinfo(CODESET);
    if (codeset != NULL && codeset[0] != '\0') {
        // The codeset name is decoded from UTF-8
        return PyUnicode_FromString(codeset);
    }

#    ifdef _Py_FORCE_UTF8_FS_ENCODING
    // nl_langinfo() can return an empty string when the LC_CTYPE locale is
    // not supported. Fall back to UTF-8, the default charset on macOS.
    return PyUnicode_FromString("UTF-8");
#    else
    PyErr_SetString(PyExc_ValueError,
                    "failed to get the locale encoding: "
                    "nl_langinfo(CODESET) returns an empty string");
    return NULL;
#    endif  // !_Py_FORCE_UTF8_FS_ENCODING
#  endif  // !MS_WINDOWS
#endif  // !_Py_FORCE_UTF8_LOCALE
}


#ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */

Expand Down
125 changes: 64 additions & 61 deletions Python/initconfig.c
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,7 @@ config_set_bytes_string(PyConfig *config, wchar_t **config_str,
configured. */
PyStatus
PyConfig_SetBytesString(PyConfig *config, wchar_t **config_str,
const char *str)
const char *str)
{
return CONFIG_SET_BYTES_STR(config, config_str, str, "string");
}
Expand Down Expand Up @@ -1466,8 +1466,13 @@ config_read_complex_options(PyConfig *config)


static const wchar_t *
config_get_stdio_errors(void)
config_get_stdio_errors(const PyPreConfig *preconfig)
{
if (preconfig->utf8_mode) {
/* UTF-8 Mode uses UTF-8/surrogateescape */
return L"surrogateescape";
}

#ifndef MS_WINDOWS
const char *loc = setlocale(LC_CTYPE, NULL);
if (loc != NULL) {
Expand All @@ -1492,26 +1497,41 @@ config_get_stdio_errors(void)
}


// See also _Py_GetLocaleEncoding() and config_get_fs_encoding()
static PyStatus
config_get_locale_encoding(PyConfig *config, wchar_t **locale_encoding)
config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig,
wchar_t **locale_encoding)
{
#ifdef _Py_FORCE_UTF8_LOCALE
return PyConfig_SetString(config, locale_encoding, L"utf-8");
#else
if (preconfig->utf8_mode) {
return PyConfig_SetString(config, locale_encoding, L"utf-8");
}

#ifdef MS_WINDOWS
char encoding[20];
PyOS_snprintf(encoding, sizeof(encoding), "cp%u", GetACP());
return PyConfig_SetBytesString(config, locale_encoding, encoding);
#elif defined(_Py_FORCE_UTF8_LOCALE)
return PyConfig_SetString(config, locale_encoding, L"utf-8");
#else
const char *encoding = nl_langinfo(CODESET);
if (!encoding || encoding[0] == '\0') {
#ifdef _Py_FORCE_UTF8_FS_ENCODING
// nl_langinfo() can return an empty string when the LC_CTYPE locale is
// not supported. Default to UTF-8 in that case, because UTF-8 is the
// default charset on macOS.
encoding = "UTF-8";
#else
return _PyStatus_ERR("failed to get the locale encoding: "
"nl_langinfo(CODESET) failed");
"nl_langinfo(CODESET) returns an empty string");
#endif
}
/* nl_langinfo(CODESET) is decoded by Py_DecodeLocale() */
return CONFIG_SET_BYTES_STR(config,
locale_encoding, encoding,
"nl_langinfo(CODESET)");
#endif
#endif // !MS_WINDOWS
#endif // !_Py_FORCE_UTF8_LOCALE
}


Expand Down Expand Up @@ -1596,33 +1616,16 @@ config_init_stdio_encoding(PyConfig *config,
PyMem_RawFree(pythonioencoding);
}

/* UTF-8 Mode uses UTF-8/surrogateescape */
if (preconfig->utf8_mode) {
if (config->stdio_encoding == NULL) {
status = PyConfig_SetString(config, &config->stdio_encoding,
L"utf-8");
if (_PyStatus_EXCEPTION(status)) {
return status;
}
}
if (config->stdio_errors == NULL) {
status = PyConfig_SetString(config, &config->stdio_errors,
L"surrogateescape");
if (_PyStatus_EXCEPTION(status)) {
return status;
}
}
}

/* Choose the default error handler based on the current locale. */
if (config->stdio_encoding == NULL) {
status = config_get_locale_encoding(config, &config->stdio_encoding);
status = config_get_locale_encoding(config, preconfig,
&config->stdio_encoding);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
}
if (config->stdio_errors == NULL) {
const wchar_t *errors = config_get_stdio_errors();
const wchar_t *errors = config_get_stdio_errors(preconfig);
assert(errors != NULL);

status = PyConfig_SetString(config, &config->stdio_errors, errors);
Expand All @@ -1635,46 +1638,46 @@ config_init_stdio_encoding(PyConfig *config,
}


// Choose the filesystem encoding name and store it into *fs_encoding.
//
// - Builds defining _Py_FORCE_UTF8_FS_ENCODING always use "utf-8".
// - Windows uses "mbcs" when preconfig->legacy_windows_fs_encoding is set
//   (legacy mbcs/replace), otherwise "utf-8" (utf-8/surrogatepass, PEP 529).
// - Other platforms use "utf-8" in UTF-8 Mode, "ascii" when
//   _Py_GetForceASCII() is true, and the locale encoding otherwise.
//
// Return the PyStatus produced by the call that stores the string.
//
// See also config_get_locale_encoding()
static PyStatus
config_get_fs_encoding(PyConfig *config, const PyPreConfig *preconfig,
                       wchar_t **fs_encoding)
{
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return PyConfig_SetString(config, fs_encoding, L"utf-8");
#elif defined(MS_WINDOWS)
    return PyConfig_SetString(
        config, fs_encoding,
        preconfig->legacy_windows_fs_encoding ? L"mbcs" : L"utf-8");
#else  // !MS_WINDOWS
    if (preconfig->utf8_mode) {
        return PyConfig_SetString(config, fs_encoding, L"utf-8");
    }
    if (_Py_GetForceASCII()) {
        return PyConfig_SetString(config, fs_encoding, L"ascii");
    }
    return config_get_locale_encoding(config, preconfig, fs_encoding);
#endif  // !MS_WINDOWS
}


static PyStatus
config_init_fs_encoding(PyConfig *config, const PyPreConfig *preconfig)
{
PyStatus status;

if (config->filesystem_encoding == NULL) {
#ifdef _Py_FORCE_UTF8_FS_ENCODING
status = PyConfig_SetString(config, &config->filesystem_encoding, L"utf-8");
#else

#ifdef MS_WINDOWS
if (preconfig->legacy_windows_fs_encoding) {
/* Legacy Windows filesystem encoding: mbcs/replace */
status = PyConfig_SetString(config, &config->filesystem_encoding,
L"mbcs");
}
else
#endif
if (preconfig->utf8_mode) {
status = PyConfig_SetString(config, &config->filesystem_encoding,
L"utf-8");
}
#ifndef MS_WINDOWS
else if (_Py_GetForceASCII()) {
status = PyConfig_SetString(config, &config->filesystem_encoding,
L"ascii");
}
#endif
else {
#ifdef MS_WINDOWS
/* Windows defaults to utf-8/surrogatepass (PEP 529). */
status = PyConfig_SetString(config, &config->filesystem_encoding,
L"utf-8");
#else
status = config_get_locale_encoding(config,
&config->filesystem_encoding);
#endif
}
#endif /* !_Py_FORCE_UTF8_FS_ENCODING */

status = config_get_fs_encoding(config, preconfig,
&config->filesystem_encoding);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
Expand Down