Skip to content

Commit eb88f21

Browse files
authored
gh-89653: PEP 670: Convert unicodeobject.h macros to functions (#92648)
Convert the following Unicode macros to static inline functions.

Surrogate functions:

* Py_UNICODE_IS_SURROGATE()
* Py_UNICODE_IS_HIGH_SURROGATE()
* Py_UNICODE_IS_LOW_SURROGATE()
* Py_UNICODE_HIGH_SURROGATE()
* Py_UNICODE_LOW_SURROGATE()
* Py_UNICODE_JOIN_SURROGATES()

"Is" functions:

* Py_UNICODE_ISALNUM()
* Py_UNICODE_ISSPACE()

In the implementation of these functions, the character type is now well defined as Py_UCS4.
1 parent 1d1929f commit eb88f21

File tree

1 file changed

+66
-48
lines changed

1 file changed

+66
-48
lines changed

Include/cpython/unicodeobject.h

+66-48
Original file line numberDiff line numberDiff line change
@@ -15,53 +15,31 @@
1515
# define USE_UNICODE_WCHAR_CACHE 1
1616
#endif /* USE_UNICODE_WCHAR_CACHE */
1717

18-
/* Since splitting on whitespace is an important use case, and
19-
whitespace in most situations is solely ASCII whitespace, we
20-
optimize for the common case by using a quick look-up table
21-
_Py_ascii_whitespace (see below) with an inlined check.
22-
23-
*/
24-
#define Py_UNICODE_ISSPACE(ch) \
25-
((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
26-
27-
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
28-
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
29-
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
30-
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
31-
32-
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
33-
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
34-
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
35-
36-
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
37-
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
38-
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
39-
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
18+
// Static inline functions to work with surrogates
19+
// Return 1 if ch lies in the surrogate range 0xD800..0xDFFF (inclusive),
// 0 otherwise.
static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
    if (ch < 0xD800) {
        return 0;
    }
    return ch <= 0xDFFF;
}
22+
// Return 1 if ch lies in the high-surrogate range 0xD800..0xDBFF (inclusive),
// 0 otherwise.
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
    if (ch < 0xD800) {
        return 0;
    }
    return ch <= 0xDBFF;
}
25+
// Return 1 if ch lies in the low-surrogate range 0xDC00..0xDFFF (inclusive),
// 0 otherwise.
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
    if (ch < 0xDC00) {
        return 0;
    }
    return ch <= 0xDFFF;
}
4028

41-
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
42-
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
43-
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
29+
// Join two surrogate characters and return a single Py_UCS4 value.
30+
static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
31+
return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
32+
}
4433

45-
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
34+
// High surrogate = top 10 bits added to D800
35+
static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
36+
return (0xD800 - (0x10000 >> 10) + (ch >> 10));
37+
}
4638

47-
#define Py_UNICODE_ISALNUM(ch) \
48-
(Py_UNICODE_ISALPHA(ch) || \
49-
Py_UNICODE_ISDECIMAL(ch) || \
50-
Py_UNICODE_ISDIGIT(ch) || \
51-
Py_UNICODE_ISNUMERIC(ch))
52-
53-
/* macros to work with surrogates */
54-
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
55-
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
56-
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
57-
/* Join two surrogate characters and return a single Py_UCS4 value. */
58-
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
59-
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
60-
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
61-
/* high surrogate = top 10 bits added to D800 */
62-
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
63-
/* low surrogate = bottom 10 bits added to DC00 */
64-
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
39+
// Low surrogate = bottom 10 bits added to DC00
40+
static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
41+
return (0xDC00 + (ch & 0x3FF));
42+
}
6543

6644
/* --- Unicode Type ------------------------------------------------------- */
6745

@@ -1013,10 +991,6 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1013991

1014992
/* === Characters Type APIs =============================================== */
1015993

1016-
/* Helper array used by Py_UNICODE_ISSPACE(). */
1017-
1018-
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1019-
1020994
/* These should not be used directly. Use the Py_UNICODE_IS* and
1021995
Py_UNICODE_TO* macros instead.
1022996
@@ -1124,6 +1098,50 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
11241098
Py_UCS4 ch /* Unicode character */
11251099
);
11261100

1101+
// Helper array used by Py_UNICODE_ISSPACE().
1102+
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1103+
1104+
// Since splitting on whitespace is an important use case, and
1105+
// whitespace in most situations is solely ASCII whitespace, we
1106+
// optimize for the common case by using a quick look-up table
1107+
// _Py_ascii_whitespace (see below) with an inlined check.
1108+
static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
1109+
if (ch < 128) {
1110+
return _Py_ascii_whitespace[ch];
1111+
}
1112+
return _PyUnicode_IsWhitespace(ch);
1113+
}
1114+
1115+
// Thin alias macros: each one forwards its argument unchanged to the
// _PyUnicode_* function named on the right-hand side.  No extra logic
// (and no cast of the argument) is applied by the macro itself.

// Py_UNICODE_IS* aliases for the lowercase / uppercase / titlecase /
// linebreak functions.
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
1116+
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
1117+
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
1118+
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
1119+
1120+
// Py_UNICODE_TO* aliases for the lowercase / uppercase / titlecase
// conversion functions.
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
1121+
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
1122+
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
1123+
1124+
// Py_UNICODE_IS* aliases for the decimal-digit / digit / numeric /
// printable functions.  The first three are also used by
// Py_UNICODE_ISALNUM() below.
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
1125+
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
1126+
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
1127+
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
1128+
1129+
// Py_UNICODE_TO* aliases for the decimal-digit / digit / numeric
// conversion functions.
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
1130+
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
1131+
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
1132+
1133+
// Alias for _PyUnicode_IsAlpha(); also used by Py_UNICODE_ISALNUM() below.
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
1134+
1135+
// Return 1 if any of Py_UNICODE_ISALPHA(), Py_UNICODE_ISDECIMAL(),
// Py_UNICODE_ISDIGIT() or Py_UNICODE_ISNUMERIC() is true for ch, else 0.
// The checks run in that order and stop at the first one that succeeds.
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
    if (Py_UNICODE_ISALPHA(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDECIMAL(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDIGIT(ch)) {
        return 1;
    }
    return Py_UNICODE_ISNUMERIC(ch) != 0;
}
1141+
1142+
1143+
/* === Misc functions ===================================================== */
1144+
11271145
PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
11281146

11291147
/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/

0 commit comments

Comments
 (0)