Skip to content

Commit 5a71387

Browse files
committed
add find_first_nonascii
1 parent a8fa4ad commit 5a71387

File tree

1 file changed

+79
-11
lines changed

1 file changed

+79
-11
lines changed

Objects/unicodeobject.c

Lines changed: 79 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5043,6 +5043,49 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
50435043
return p - start;
50445044
}
50455045

5046+
static Py_ssize_t
5047+
find_first_nonascii(const char *start, const char *end)
5048+
{
5049+
const char *p = start;
5050+
5051+
while (p < end) {
5052+
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5053+
for an explanation. */
5054+
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5055+
const char *e = end - SIZEOF_SIZE_T;
5056+
while (p <= e) {
5057+
size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
5058+
if (value) {
5059+
// Optimization only for major platforms we have CI.
5060+
#if PY_LITTLE_ENDIAN && (defined(__clang__) || defined(__GNUC__))
5061+
#if SIZEOF_SIZE_T == SIZEOF_LONG
5062+
return p - start + (__builtin_ctzl(value)-7) / 8;
5063+
#elif SIZEOF_SIZE_T == SIZEOF_LONG_LONG
5064+
return p - start + (__builtin_ctzll(value)-7) / 8;
5065+
#endif
5066+
#elif PY_LITTLE_ENDIAN && defined(_MSC_VER)
5067+
unsigned long bitpos;
5068+
#if SIZEOF_SIZE_T == 4
5069+
_BitScanForward(&bitpos, value);
5070+
#else
5071+
_BitScanForward64(&bitpos, value);
5072+
#endif
5073+
return p - start + (bitpos-7) / 8;
5074+
#endif
5075+
break;
5076+
}
5077+
p += SIZEOF_SIZE_T;
5078+
}
5079+
if (p == end)
5080+
break;
5081+
}
5082+
if ((unsigned char)*p & 0x80)
5083+
break;
5084+
++p;
5085+
}
5086+
return p - start;
5087+
}
5088+
50465089

50475090
static int
50485091
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
@@ -5187,27 +5230,50 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
51875230
return get_latin1_char((unsigned char)s[0]);
51885231
}
51895232

5190-
// fast path: try ASCII string.
51915233
const char *starts = s;
51925234
const char *end = s + size;
5193-
PyObject *u = PyUnicode_New(size, 127);
5194-
if (u == NULL) {
5195-
return NULL;
5196-
}
5197-
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5198-
if (decoded == size) {
5235+
5236+
Py_ssize_t pos = find_first_nonascii(starts, end);
5237+
if (pos == size) {
5238+
// fast path: ASCII
5239+
PyObject *u = PyUnicode_New(size, 127);
5240+
if (u == NULL) {
5241+
return NULL;
5242+
}
5243+
memcpy(PyUnicode_1BYTE_DATA(u), s, size);
51995244
if (consumed) {
52005245
*consumed = size;
52015246
}
52025247
return u;
52035248
}
5204-
s += decoded;
5205-
size -= decoded;
5249+
5250+
int maxchr = 127;
5251+
unsigned char ch = (unsigned char)s[pos];
5252+
if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2) {
5253+
if (ch < 0xc4) { // latin1
5254+
maxchr = 255;
5255+
}
5256+
else if (ch < 0xf0) { // ucs2
5257+
maxchr = 65535;
5258+
}
5259+
else { // ucs4
5260+
maxchr = 0x10ffff;
5261+
}
5262+
}
5263+
PyObject *u = PyUnicode_New(size, maxchr);
5264+
if (!u) {
5265+
return NULL;
5266+
}
52065267

52075268
// Use _PyUnicodeWriter after fast path is failed.
52085269
_PyUnicodeWriter writer;
52095270
_PyUnicodeWriter_InitWithBuffer(&writer, u);
5210-
writer.pos = decoded;
5271+
if (maxchr <= 255) {
5272+
memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5273+
s += pos;
5274+
size -= pos;
5275+
writer.pos = pos;
5276+
}
52115277

52125278
if (unicode_decode_utf8_impl(&writer, starts, s, end,
52135279
error_handler, errors,
@@ -5267,7 +5333,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
52675333
const char *errors,
52685334
Py_ssize_t *consumed)
52695335
{
5270-
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5336+
return unicode_decode_utf8(s, size,
5337+
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5338+
errors, consumed);
52715339
}
52725340

52735341

0 commit comments

Comments
 (0)