@@ -5043,6 +5043,49 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
50435043 return p - start ;
50445044}
50455045
5046+ static Py_ssize_t
5047+ find_first_nonascii (const char * start , const char * end )
5048+ {
5049+ const char * p = start ;
5050+
5051+ while (p < end ) {
5052+ /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5053+ for an explanation. */
5054+ if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )) {
5055+ const char * e = end - SIZEOF_SIZE_T ;
5056+ while (p <= e ) {
5057+ size_t value = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
5058+ if (value ) {
5059+ // Optimization only for major platforms we have CI.
5060+ #if PY_LITTLE_ENDIAN && (defined(__clang__ ) || defined(__GNUC__ ))
5061+ #if SIZEOF_SIZE_T == SIZEOF_LONG
5062+ return p - start + (__builtin_ctzl (value )- 7 ) / 8 ;
5063+ #elif SIZEOF_SIZE_T == SIZEOF_LONG_LONG
5064+ return p - start + (__builtin_ctzll (value )- 7 ) / 8 ;
5065+ #endif
5066+ #elif PY_LITTLE_ENDIAN && defined(_MSC_VER )
5067+ unsigned long bitpos ;
5068+ #if SIZEOF_SIZE_T == 4
5069+ _BitScanForward (& bitpos , value );
5070+ #else
5071+ _BitScanForward64 (& bitpos , value );
5072+ #endif
5073+ return p - start + (bitpos - 7 ) / 8 ;
5074+ #endif
5075+ break ;
5076+ }
5077+ p += SIZEOF_SIZE_T ;
5078+ }
5079+ if (p == end )
5080+ break ;
5081+ }
5082+ if ((unsigned char )* p & 0x80 )
5083+ break ;
5084+ ++ p ;
5085+ }
5086+ return p - start ;
5087+ }
5088+
50465089
50475090static int
50485091unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
@@ -5187,27 +5230,50 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
51875230 return get_latin1_char ((unsigned char )s [0 ]);
51885231 }
51895232
5190- // fast path: try ASCII string.
51915233 const char * starts = s ;
51925234 const char * end = s + size ;
5193- PyObject * u = PyUnicode_New (size , 127 );
5194- if (u == NULL ) {
5195- return NULL ;
5196- }
5197- Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
5198- if (decoded == size ) {
5235+
5236+ Py_ssize_t pos = find_first_nonascii (starts , end );
5237+ if (pos == size ) {
5238+ // fast path: ASCII
5239+ PyObject * u = PyUnicode_New (size , 127 );
5240+ if (u == NULL ) {
5241+ return NULL ;
5242+ }
5243+ memcpy (PyUnicode_1BYTE_DATA (u ), s , size );
51995244 if (consumed ) {
52005245 * consumed = size ;
52015246 }
52025247 return u ;
52035248 }
5204- s += decoded ;
5205- size -= decoded ;
5249+
5250+ int maxchr = 127 ;
5251+ unsigned char ch = (unsigned char )s [pos ];
5252+ if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2 ) {
5253+ if (ch < 0xc4 ) { // latin1
5254+ maxchr = 255 ;
5255+ }
5256+ else if (ch < 0xf0 ) { // ucs2
5257+ maxchr = 65535 ;
5258+ }
5259+ else { // ucs4
5260+ maxchr = 0x10ffff ;
5261+ }
5262+ }
5263+ PyObject * u = PyUnicode_New (size , maxchr );
5264+ if (!u ) {
5265+ return NULL ;
5266+ }
52065267
52075268 // Use _PyUnicodeWriter after fast path is failed.
52085269 _PyUnicodeWriter writer ;
52095270 _PyUnicodeWriter_InitWithBuffer (& writer , u );
5210- writer .pos = decoded ;
5271+ if (maxchr <= 255 ) {
5272+ memcpy (PyUnicode_1BYTE_DATA (u ), s , pos );
5273+ s += pos ;
5274+ size -= pos ;
5275+ writer .pos = pos ;
5276+ }
52115277
52125278 if (unicode_decode_utf8_impl (& writer , starts , s , end ,
52135279 error_handler , errors ,
@@ -5267,7 +5333,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
52675333 const char * errors ,
52685334 Py_ssize_t * consumed )
52695335{
5270- return unicode_decode_utf8 (s , size , _Py_ERROR_UNKNOWN , errors , consumed );
5336+ return unicode_decode_utf8 (s , size ,
5337+ errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT ,
5338+ errors , consumed );
52715339}
52725340
52735341
0 commit comments