Skip to content

Commit 2f9ada9

Browse files
authored
bpo-40521: Make Unicode latin1 singletons per interpreter (GH-21101)
Each interpreter now has its own Unicode latin1 singletons. Remove "ifdef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS" and "ifdef LATIN1_SINGLETONS": always enable latin1 singletons. Optimize unicode_result_ready(): only attempt to get a latin1 singleton for PyUnicode_1BYTE_KIND.
1 parent bbf36e8 commit 2f9ada9

File tree

3 files changed

+36
-43
lines changed

3 files changed

+36
-43
lines changed

Include/internal/pycore_interp.h

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ struct _Py_bytes_state {
7373
struct _Py_unicode_state {
7474
// The empty Unicode object is a singleton to improve performance.
7575
PyObject *empty;
76+
/* Single character Unicode strings in the Latin-1 range are being
77+
shared as well. */
78+
PyObject *latin1[256];
7679
struct _Py_unicode_fs_codec fs_codec;
7780
};
7881

Misc/NEWS.d/next/Core and Builtins/2020-05-20-01-17-34.bpo-40521.wvAehI.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Each interpreter now has its own free lists, singletons and caches:
33
* Free lists: float, tuple, list, dict, frame, context,
44
asynchronous generator, MemoryError.
55
* Singletons: empty tuple, empty bytes string, empty Unicode string,
6-
single byte character.
6+
single byte character, single Unicode (latin1) character.
77
* Slice cache.
88

99
They are no longer shared by all interpreters.

Objects/unicodeobject.c

+32-42
Original file line numberDiff line numberDiff line change
@@ -303,17 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
303303
/* List of static strings. */
304304
static _Py_Identifier *static_strings = NULL;
305305

306-
/* bpo-40521: Latin1 singletons are shared by all interpreters. */
307-
#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
308-
# define LATIN1_SINGLETONS
309-
#endif
310-
311-
#ifdef LATIN1_SINGLETONS
312-
/* Single character Unicode strings in the Latin-1 range are being
313-
shared as well. */
314-
static PyObject *unicode_latin1[256] = {NULL};
315-
#endif
316-
317306
/* Fast detection of the most frequent whitespace characters */
318307
const unsigned char _Py_ascii_whitespace[] = {
319308
0, 0, 0, 0, 0, 0, 0, 0,
@@ -657,9 +646,8 @@ unicode_result_wchar(PyObject *unicode)
657646
if (len == 1) {
658647
wchar_t ch = _PyUnicode_WSTR(unicode)[0];
659648
if ((Py_UCS4)ch < 256) {
660-
PyObject *latin1_char = get_latin1_char((unsigned char)ch);
661649
Py_DECREF(unicode);
662-
return latin1_char;
650+
return get_latin1_char((unsigned char)ch);
663651
}
664652
}
665653

@@ -692,13 +680,13 @@ unicode_result_ready(PyObject *unicode)
692680
return empty;
693681
}
694682

695-
#ifdef LATIN1_SINGLETONS
696683
if (length == 1) {
697-
const void *data = PyUnicode_DATA(unicode);
698684
int kind = PyUnicode_KIND(unicode);
699-
Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
700-
if (ch < 256) {
701-
PyObject *latin1_char = unicode_latin1[ch];
685+
if (kind == PyUnicode_1BYTE_KIND) {
686+
Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
687+
Py_UCS1 ch = data[0];
688+
struct _Py_unicode_state *state = get_unicode_state();
689+
PyObject *latin1_char = state->latin1[ch];
702690
if (latin1_char != NULL) {
703691
if (unicode != latin1_char) {
704692
Py_INCREF(latin1_char);
@@ -709,12 +697,14 @@ unicode_result_ready(PyObject *unicode)
709697
else {
710698
assert(_PyUnicode_CheckConsistency(unicode, 1));
711699
Py_INCREF(unicode);
712-
unicode_latin1[ch] = unicode;
700+
state->latin1[ch] = unicode;
713701
return unicode;
714702
}
715703
}
704+
else {
705+
assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
706+
}
716707
}
717-
#endif
718708

719709
assert(_PyUnicode_CheckConsistency(unicode, 1));
720710
return unicode;
@@ -1981,18 +1971,18 @@ unicode_dealloc(PyObject *unicode)
19811971
static int
19821972
unicode_is_singleton(PyObject *unicode)
19831973
{
1984-
if (unicode == unicode_get_empty()) {
1974+
struct _Py_unicode_state *state = get_unicode_state();
1975+
if (unicode == state->empty) {
19851976
return 1;
19861977
}
1987-
#ifdef LATIN1_SINGLETONS
19881978
PyASCIIObject *ascii = (PyASCIIObject *)unicode;
19891979
if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
19901980
{
19911981
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1992-
if (ch < 256 && unicode_latin1[ch] == unicode)
1982+
if (ch < 256 && state->latin1[ch] == unicode) {
19931983
return 1;
1984+
}
19941985
}
1995-
#endif
19961986
return 0;
19971987
}
19981988
#endif
@@ -2130,17 +2120,15 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
21302120
}
21312121

21322122
static PyObject*
2133-
get_latin1_char(unsigned char ch)
2123+
get_latin1_char(Py_UCS1 ch)
21342124
{
2135-
PyObject *unicode;
2125+
struct _Py_unicode_state *state = get_unicode_state();
21362126

2137-
#ifdef LATIN1_SINGLETONS
2138-
unicode = unicode_latin1[ch];
2127+
PyObject *unicode = state->latin1[ch];
21392128
if (unicode) {
21402129
Py_INCREF(unicode);
21412130
return unicode;
21422131
}
2143-
#endif
21442132

21452133
unicode = PyUnicode_New(1, ch);
21462134
if (!unicode) {
@@ -2150,10 +2138,8 @@ get_latin1_char(unsigned char ch)
21502138
PyUnicode_1BYTE_DATA(unicode)[0] = ch;
21512139
assert(_PyUnicode_CheckConsistency(unicode, 1));
21522140

2153-
#ifdef LATIN1_SINGLETONS
21542141
Py_INCREF(unicode);
2155-
unicode_latin1[ch] = unicode;
2156-
#endif
2142+
state->latin1[ch] = unicode;
21572143
return unicode;
21582144
}
21592145

@@ -2164,8 +2150,9 @@ unicode_char(Py_UCS4 ch)
21642150

21652151
assert(ch <= MAX_UNICODE);
21662152

2167-
if (ch < 256)
2153+
if (ch < 256) {
21682154
return get_latin1_char(ch);
2155+
}
21692156

21702157
unicode = PyUnicode_New(1, ch);
21712158
if (unicode == NULL)
@@ -2367,11 +2354,13 @@ _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
23672354
PyObject *res;
23682355
unsigned char max_char;
23692356

2370-
if (size == 0)
2357+
if (size == 0) {
23712358
_Py_RETURN_UNICODE_EMPTY();
2359+
}
23722360
assert(size > 0);
2373-
if (size == 1)
2361+
if (size == 1) {
23742362
return get_latin1_char(u[0]);
2363+
}
23752364

23762365
max_char = ucs1lib_find_max_char(u, u + size);
23772366
res = PyUnicode_New(size, max_char);
@@ -5008,8 +4997,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
50084997

50094998
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
50104999
if (size == 1 && (unsigned char)s[0] < 128) {
5011-
if (consumed)
5000+
if (consumed) {
50125001
*consumed = 1;
5002+
}
50135003
return get_latin1_char((unsigned char)s[0]);
50145004
}
50155005

@@ -7176,8 +7166,9 @@ PyUnicode_DecodeASCII(const char *s,
71767166
_Py_RETURN_UNICODE_EMPTY();
71777167

71787168
/* ASCII is equivalent to the first 128 ordinals in Unicode. */
7179-
if (size == 1 && (unsigned char)s[0] < 128)
7169+
if (size == 1 && (unsigned char)s[0] < 128) {
71807170
return get_latin1_char((unsigned char)s[0]);
7171+
}
71817172

71827173
// Shortcut for simple case
71837174
PyObject *u = PyUnicode_New(size, 127);
@@ -16234,12 +16225,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
1623416225

1623516226
Py_CLEAR(state->empty);
1623616227

16228+
for (Py_ssize_t i = 0; i < 256; i++) {
16229+
Py_CLEAR(state->latin1[i]);
16230+
}
16231+
1623716232
if (is_main_interp) {
16238-
#ifdef LATIN1_SINGLETONS
16239-
for (Py_ssize_t i = 0; i < 256; i++) {
16240-
Py_CLEAR(unicode_latin1[i]);
16241-
}
16242-
#endif
1624316233
unicode_clear_static_strings();
1624416234
}
1624516235

0 commit comments

Comments
 (0)