Skip to content

Commit e507b4a

Browse files
committed
gh-139353: Split unicodeobject.c into smaller files
Create files:

* Objects/unicode_codecs.c
* Objects/unicode_codecs_utf.c
* Objects/unicode_codecs_win.c
* Objects/unicode_convert.c
* Objects/unicode_format.c
* Objects/unicode_fromformat.c
* Objects/unicode_iter.c
* Objects/unicode_methods.c
* Objects/unicode_module.c
* Objects/unicode_writer.c
1 parent 8d83b7d commit e507b4a

24 files changed

+15259
-14754
lines changed

Include/internal/pycore_fileutils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ extern "C" {
2222

2323
// Export for '_testinternalcapi' shared extension
2424
PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
25+
extern _Py_error_handler _Py_GetErrorHandlerWide(const wchar_t *errors);
2526

2627
// Export for '_testinternalcapi' shared extension
2728
PyAPI_FUNC(int) _Py_DecodeLocaleEx(

Include/internal/pycore_unicodeobject.h

Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,257 @@ extern "C" {
1111
#include "pycore_fileutils.h" // _Py_error_handler
1212
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
1313

14+
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
15+
// The value must be the same in fileutils.c.
16+
#define _Py_MAX_UNICODE 0x10ffff
17+
18+
#define _Py_LEFTSTRIP 0
19+
#define _Py_RIGHTSTRIP 1
20+
#define _Py_BOTHSTRIP 2
21+
22+
extern int _PyUnicode_CheckEncodingErrors(
23+
const char *encoding,
24+
const char *errors);
25+
extern PyObject* _PyUnicode_GetEmpty(void);
26+
extern PyObject* _PyUnicode_Result(PyObject *unicode);
27+
extern PyObject* _PyUnicode_ResultUnchanged(PyObject *unicode);
28+
extern Py_ssize_t _PyUnicode_FindChar(
29+
const void *s,
30+
int kind,
31+
Py_ssize_t size,
32+
Py_UCS4 ch,
33+
int direction);
34+
extern PyObject* _PyUnicode_GetLatin1Char(Py_UCS1 ch);
35+
extern char* PyUnicode_UTF8(PyObject *op);
36+
extern Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op);
37+
extern void _PyUnicode_SET_UTF8(PyObject *op, char *utf8);
38+
extern void _PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length);
39+
extern PyObject* _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size);
40+
extern PyObject* _PyUnicode_TranslateCharmap(
41+
PyObject *input,
42+
PyObject *mapping,
43+
const char *errors);
44+
extern int _PyUnicode_FillUTF8(PyObject *unicode);
45+
extern int _PyUnicode_DecodeUTF8Writer(
46+
_PyUnicodeWriter *writer,
47+
const char *s,
48+
Py_ssize_t size,
49+
_Py_error_handler error_handler,
50+
const char *errors,
51+
Py_ssize_t *consumed);
52+
extern int _Py_normalize_encoding(const char *, char *, size_t);
53+
extern void* _PyUnicode_AsKind(
54+
int skind,
55+
void const *data,
56+
Py_ssize_t len,
57+
int kind);
58+
extern int _PyUnicode_Tailmatch(
59+
PyObject *self,
60+
PyObject *substring,
61+
Py_ssize_t start,
62+
Py_ssize_t end,
63+
int direction);
64+
extern Py_ssize_t _PyUnicode_Count(
65+
PyObject *str,
66+
PyObject *substr,
67+
Py_ssize_t start,
68+
Py_ssize_t end);
69+
extern PyObject * _PyUnicode_Replace(
70+
PyObject *self,
71+
PyObject *str1,
72+
PyObject *str2,
73+
Py_ssize_t maxcount);
74+
extern Py_ssize_t _PyUnicode_AnylibFindSlice(
75+
PyObject* s1,
76+
PyObject* s2,
77+
Py_ssize_t start,
78+
Py_ssize_t end,
79+
int direction);
80+
extern int _PyUnicode_FindMaxCharSurrogates(
81+
const wchar_t *begin,
82+
const wchar_t *end,
83+
Py_UCS4 *maxchar,
84+
Py_ssize_t *num_surrogates);
85+
extern void _PyUnicode_WriteWideChar(
86+
int kind,
87+
void *data,
88+
const wchar_t *u,
89+
Py_ssize_t size,
90+
Py_ssize_t num_surrogates);
91+
extern PyObject* _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size);
92+
extern PyObject* _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size);
93+
extern PyObject* _PyUnicode_FromOrdinal(Py_UCS4 ordinal);
94+
extern PyObject* _PyUnicode_do_string_format(
95+
PyObject *self,
96+
PyObject *args,
97+
PyObject *kwargs);
98+
extern PyObject* _PyUnicode_do_string_format_map(
99+
PyObject *self,
100+
PyObject *obj);
101+
extern Py_hash_t _PyUnicode_Hash(PyObject *self);
102+
extern PyObject* _PyUnicode_Iter(PyObject *seq);
103+
extern int _PyUnicode_IsModifiable(PyObject *unicode);
104+
extern void _PyUnicode_Fill(
105+
int kind,
106+
void *data,
107+
Py_UCS4 value,
108+
Py_ssize_t start,
109+
Py_ssize_t length);
110+
extern PyObject* _PyUnicode_ResizeCompact(
111+
PyObject *unicode,
112+
Py_ssize_t length);
113+
extern int _PyUnicode_CheckModifiable(PyObject *unicode);
114+
extern PyObject* _PyUnicode_Repr(PyObject *unicode);
115+
extern PyObject* _PyUnicode_Pad(
116+
PyObject *self,
117+
Py_ssize_t left,
118+
Py_ssize_t right,
119+
Py_UCS4 fill);
120+
extern int _PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length);
121+
extern PyObject* _PyUnicode_EncodeUTF8(
122+
PyObject *unicode,
123+
_Py_error_handler error_handler,
124+
const char *errors);
125+
extern PyObject* _PyUnicode_DecodeUTF8(
126+
const char *s,
127+
Py_ssize_t size,
128+
_Py_error_handler error_handler,
129+
const char *errors,
130+
Py_ssize_t *consumed);
131+
extern char* _PyUnicode_Backslashreplace(
132+
PyBytesWriter *writer,
133+
char *str,
134+
PyObject *unicode,
135+
Py_ssize_t collstart,
136+
Py_ssize_t collend);
137+
extern char* _PyUnicode_Xmlcharrefreplace(
138+
PyBytesWriter *writer,
139+
char *str,
140+
PyObject *unicode,
141+
Py_ssize_t collstart,
142+
Py_ssize_t collend);
143+
extern PyObject* _PyUnicode_EncodeCallErrorHandler(
144+
const char *errors,
145+
PyObject **errorHandler,
146+
const char *encoding,
147+
const char *reason,
148+
PyObject *unicode,
149+
PyObject **exceptionObject,
150+
Py_ssize_t startpos,
151+
Py_ssize_t endpos,
152+
Py_ssize_t *newpos);
153+
extern void _PyUnicode_RaiseEncodeException(
154+
PyObject **exceptionObject,
155+
const char *encoding,
156+
PyObject *unicode,
157+
Py_ssize_t startpos,
158+
Py_ssize_t endpos,
159+
const char *reason);
160+
extern int _PyUnicode_DecodeCallErrorHandlerWriter(
161+
const char *errors,
162+
PyObject **errorHandler,
163+
const char *encoding,
164+
const char *reason,
165+
const char **input,
166+
const char **inend,
167+
Py_ssize_t *startinpos,
168+
Py_ssize_t *endinpos,
169+
PyObject **exceptionObject,
170+
const char **inptr,
171+
_PyUnicodeWriter *writer);
172+
extern PyObject* _PyUnicode_EncodeUCS1(
173+
PyObject *unicode,
174+
const char *errors,
175+
const Py_UCS4 limit);
176+
extern void _PyUnicode_InitGlobalState(void);
177+
extern PyObject* _PyUnicode_do_strip(PyObject *self, int striptype);
178+
extern PyObject* _PyUnicode_Split(
179+
PyObject *self,
180+
PyObject *substring,
181+
Py_ssize_t maxcount);
182+
extern PyObject* _PyUnicode_RSplit(
183+
PyObject *self,
184+
PyObject *substring,
185+
Py_ssize_t maxcount);
186+
extern PyObject* _PyUnicode_Maketrans(
187+
PyObject *x,
188+
PyObject *y,
189+
PyObject *z);
190+
extern PyObject* _PyUnicode_Expandtabs(
191+
PyObject *self,
192+
int tabsize);
193+
/* Declaration only; the definition is not part of this header.
   NOTE(review): judging by the name and the PyObject **exceptionObject
   parameter, this presumably creates or updates a decode exception in
   *exceptionObject from the encoding name, the input buffer (input/length),
   the startpos/endpos range and the reason string -- confirm against the
   definition.
   Declared with an explicit "extern" like every other function
   declaration in this group. */
extern void _PyUnicode_MakeDecodeException(
194+
PyObject **exceptionObject,
195+
const char *encoding,
196+
const char *input, Py_ssize_t length,
197+
Py_ssize_t startpos, Py_ssize_t endpos,
198+
const char *reason);
199+
200+
extern PyTypeObject _Py_EncodingMapType;
201+
extern PyTypeObject _Py_FieldNameIter_Type;
202+
extern PyTypeObject _Py_FormatterIter_Type;
203+
204+
/* Helper macro that fixes up start/end slice values against a length.

   - end: if it is greater than len it becomes len; if it is negative,
     len is added to it and a still-negative result becomes 0.
   - start: if it is negative, len is added to it and a still-negative
     result becomes 0. start is not capped at len.

   start and end are assigned in place, so they must be modifiable
   lvalues. All three arguments are expanded more than once, so avoid
   passing expressions with side effects. */
205+
#define _Py_ADJUST_INDICES(start, end, len) \
206+
do { \
207+
if (end > len) { \
208+
end = len; \
209+
} \
210+
else if (end < 0) { \
211+
end += len; \
212+
if (end < 0) { \
213+
end = 0; \
214+
} \
215+
} \
216+
if (start < 0) { \
217+
start += len; \
218+
if (start < 0) { \
219+
start = 0; \
220+
} \
221+
} \
222+
} while (0)
223+
224+
/* Generic helper macro to convert characters of different types.
225+
from_type and to_type have to be valid type names, begin and end
226+
are pointers to the source characters which should be of type
227+
"from_type *". to is a pointer of type "to_type *" and points to the
228+
buffer where the result characters are written.

Each of begin, end and to is evaluated exactly once and cached in a
local variable. Every source element is converted with a plain
(to_type) cast. The first loop handles four elements per iteration
up to _iter + _Py_SIZE_ROUND_DOWN(n, 4) (presumably n rounded down to
a multiple of 4 -- confirm against its definition); the second loop
copies the remaining elements one at a time. */
229+
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
230+
do { \
231+
to_type *_to = (to_type *)(to); \
232+
const from_type *_iter = (const from_type *)(begin);\
233+
const from_type *_end = (const from_type *)(end);\
234+
Py_ssize_t n = (_end) - (_iter); \
235+
const from_type *_unrolled_end = \
236+
_iter + _Py_SIZE_ROUND_DOWN(n, 4); \
237+
while (_iter < (_unrolled_end)) { \
238+
_to[0] = (to_type) _iter[0]; \
239+
_to[1] = (to_type) _iter[1]; \
240+
_to[2] = (to_type) _iter[2]; \
241+
_to[3] = (to_type) _iter[3]; \
242+
_iter += 4; _to += 4; \
243+
} \
244+
while (_iter < (_end)) \
245+
*_to++ = (to_type) *_iter++; \
246+
} while (0)
247+
248+
/* _PyUnicode_CHECK(op): when Py_DEBUG is defined this expands to
   _PyUnicode_CheckConsistency(op, 0); otherwise it expands to
   PyUnicode_Check(op). */
#ifdef Py_DEBUG
249+
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
250+
#else
251+
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
252+
#endif
253+
254+
/* Check that obj passes PyUnicode_Check().

   Returns 0 when the check succeeds. Otherwise calls PyErr_Format()
   with PyExc_TypeError and the format "must be str, not %T" (obj is
   the format argument) and returns -1. */
static inline int
255+
_PyUnicode_Ensure(PyObject *obj)
256+
{
257+
if (!PyUnicode_Check(obj)) {
258+
PyErr_Format(PyExc_TypeError, "must be str, not %T", obj);
259+
return -1;
260+
}
261+
return 0;
262+
}
263+
264+
14265
/* --- Characters Type APIs ----------------------------------------------- */
15266

16267
extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
@@ -73,6 +324,17 @@ extern Py_UCS4 _PyUnicode_FindMaxChar (
73324

74325
/* --- _PyUnicodeWriter API ----------------------------------------------- */
75326

327+
/* Append the single character ch to writer.

   Asserts that ch does not exceed _Py_MAX_UNICODE, then calls
   _PyUnicodeWriter_Prepare(writer, 1, ch); a negative result makes this
   function return -1 without writing anything. On success ch is stored
   at index writer->pos with PyUnicode_WRITE(), writer->pos is
   incremented by one, and 0 is returned.

   NOTE(review): _PyUnicodeWriter_Prepare() presumably makes room for
   one more character wide enough to hold ch and sets an exception on
   failure -- confirm against its definition. */
static inline int
328+
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
329+
{
330+
assert(ch <= _Py_MAX_UNICODE);
331+
if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
332+
return -1;
333+
PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
334+
writer->pos++;
335+
return 0;
336+
}
337+
76338
/* Format the object based on the format_spec, as defined in PEP 3101
77339
(Advanced String Formatting). */
78340
extern int _PyUnicode_FormatAdvancedWriter(
@@ -88,6 +350,10 @@ extern int _PyUnicodeWriter_FormatV(
88350
const char *format,
89351
va_list vargs);
90352

353+
extern void _PyUnicodeWriter_InitWithBuffer(
354+
_PyUnicodeWriter *writer,
355+
PyObject *buffer);
356+
91357
/* --- UTF-7 Codecs ------------------------------------------------------- */
92358

93359
extern PyObject* _PyUnicode_EncodeUTF7(

Makefile.pre.in

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -558,8 +558,18 @@ OBJECT_OBJS= \
558558
Objects/tupleobject.o \
559559
Objects/typeobject.o \
560560
Objects/typevarobject.o \
561-
Objects/unicodeobject.o \
561+
Objects/unicode_codecs.o \
562+
Objects/unicode_codecs_utf.o \
563+
Objects/unicode_codecs_win.o \
564+
Objects/unicode_convert.o \
565+
Objects/unicode_format.o \
566+
Objects/unicode_fromformat.o \
567+
Objects/unicode_iter.o \
568+
Objects/unicode_methods.o \
569+
Objects/unicode_module.o \
570+
Objects/unicode_writer.o \
562571
Objects/unicodectype.o \
572+
Objects/unicodeobject.o \
563573
Objects/unionobject.o \
564574
Objects/weakrefobject.o \
565575
@PERF_TRAMPOLINE_OBJ@
@@ -2105,6 +2115,16 @@ Objects/bytes_methods.o: $(srcdir)/Objects/bytes_methods.c $(BYTESTR_DEPS)
21052115
Objects/bytesobject.o: $(srcdir)/Objects/bytesobject.c $(BYTESTR_DEPS)
21062116
Objects/bytearrayobject.o: $(srcdir)/Objects/bytearrayobject.c $(BYTESTR_DEPS)
21072117

2118+
Objects/unicode_codecs.o: $(srcdir)/Objects/unicode_codecs.c $(UNICODE_DEPS)
2119+
Objects/unicode_codecs_utf.o: $(srcdir)/Objects/unicode_codecs_utf.c $(UNICODE_DEPS)
2120+
Objects/unicode_codecs_win.o: $(srcdir)/Objects/unicode_codecs_win.c $(UNICODE_DEPS)
2121+
Objects/unicode_convert.o: $(srcdir)/Objects/unicode_convert.c $(UNICODE_DEPS)
2122+
Objects/unicode_format.o: $(srcdir)/Objects/unicode_format.c $(UNICODE_DEPS)
2123+
Objects/unicode_fromformat.o: $(srcdir)/Objects/unicode_fromformat.c $(UNICODE_DEPS)
2124+
Objects/unicode_iter.o: $(srcdir)/Objects/unicode_iter.c $(UNICODE_DEPS)
2125+
Objects/unicode_methods.o: $(srcdir)/Objects/unicode_methods.c $(UNICODE_DEPS)
2126+
Objects/unicode_module.o: $(srcdir)/Objects/unicode_module.c $(UNICODE_DEPS)
2127+
Objects/unicode_writer.o: $(srcdir)/Objects/unicode_writer.c $(UNICODE_DEPS)
21082128
Objects/unicodeobject.o: $(srcdir)/Objects/unicodeobject.c $(UNICODE_DEPS)
21092129

21102130
Objects/dictobject.o: $(srcdir)/Objects/stringlib/eq.h

Objects/clinic/unicode_codecs.c.h

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Objects/clinic/unicodeobject.c.h

Lines changed: 1 addition & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)