@@ -4978,12 +4978,17 @@ PyUnicode_DecodeUTF8(const char *s,
49784978#include "stringlib/codecs.h"
49794979#include "stringlib/undef.h"
49804980
4981+ #if (SIZEOF_SIZE_T == 8 )
49814982/* Mask to quickly check whether a C 'size_t' contains a
49824983 non-ASCII, UTF8-encoded char. */
4983- #if (SIZEOF_SIZE_T == 8 )
49844984# define ASCII_CHAR_MASK 0x8080808080808080ULL
4985+ // used to count codepoints in UTF-8 string.
4986+ # define VECTOR_0101 0x0101010101010101ULL
4987+ # define VECTOR_00FF 0x00ff00ff00ff00ffULL
49854988#elif (SIZEOF_SIZE_T == 4 )
49864989# define ASCII_CHAR_MASK 0x80808080U
4990+ # define VECTOR_0101 0x01010101U
4991+ # define VECTOR_00FF 0x00ff00ffU
49874992#else
49884993# error C 'size_t' size should be either 4 or 8!
49894994#endif
@@ -5087,6 +5092,61 @@ find_first_nonascii(const char *start, const char *end)
50875092}
50885093
50895094
/* Per-byte constants spanning one size_t.  Both are derived from the all-ones
   value, so they are correct for any size_t width without a preprocessor
   branch:
     vector_01   -- 0x01 in every byte       (0x0101...01)
     vector_00ff -- 0xff in every even byte  (0x00ff...00ff)
   NOTE(review): nothing in the visible code reads these two objects; the
   VECTOR_0101 / VECTOR_00FF macros carry the same values.  They also have
   external linkage -- confirm they are needed, otherwise drop them. */
const size_t vector_01 = (size_t)-1 / 0xff;
const size_t vector_00ff = (size_t)-1 / 0x101;
5102+
/* Return 1 if ch is the first byte of a UTF-8 sequence, 0 if it is a
   continuation byte.

   A first byte looks like 0xxxxxxx (bit 7 clear) or 11xxxxxx (bit 6 set);
   only 10xxxxxx is a continuation byte.  Only bits 6 and 7 of ch are
   examined. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // Bit 0 of (~ch >> 7) is "bit 7 clear"; bit 0 of (ch >> 6) is "bit 6 set".
    return ((~ch >> 7) | (ch >> 6)) & 1;
}
5108+
5109+ static inline size_t vector_utf8_start_chars (size_t v )
5110+ {
5111+ return ((~v >>7 ) | (v >>6 )) & VECTOR_0101 ;
5112+ }
5113+
5114+ static Py_ssize_t utf8_count (const unsigned char * s , Py_ssize_t size )
5115+ {
5116+ Py_ssize_t len = 0 ;
5117+ const unsigned char * end = s + size ;
5118+
5119+ if (end - s > SIZEOF_SIZE_T * 2 ) {
5120+ while (!_Py_IS_ALIGNED (s , ALIGNOF_SIZE_T )) {
5121+ len += scalar_utf8_start_char (* s ++ );
5122+ }
5123+
5124+ while (s + SIZEOF_SIZE_T <= end ) {
5125+ const unsigned char * e = end ;
5126+ if (e - s > SIZEOF_SIZE_T * 255 ) {
5127+ e = s + SIZEOF_SIZE_T * 255 ;
5128+ }
5129+ Py_ssize_t vstart = 0 ;
5130+ while (s + SIZEOF_SIZE_T <= e ) {
5131+ size_t v = * (size_t * )s ;
5132+ size_t vs = vector_utf8_start_chars (v );
5133+ vstart += vs ;
5134+ s += SIZEOF_SIZE_T ;
5135+ }
5136+ vstart = (vstart & VECTOR_00FF ) + ((vstart >> 8 ) & VECTOR_00FF );
5137+ vstart += vstart >> 16 ;
5138+ #if SIZEOF_SIZE_T == 8
5139+ vstart += vstart >> 32 ;
5140+ #endif
5141+ len += vstart & 0x7ff ;
5142+ }
5143+ }
5144+ while (s < end ) {
5145+ len += scalar_utf8_start_char (* s ++ );
5146+ }
5147+ return len ;
5148+ }
5149+
50905150static int
50915151unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
50925152 const char * starts , const char * s , const char * end ,
@@ -5234,8 +5294,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52345294 const char * end = s + size ;
52355295
52365296 Py_ssize_t pos = find_first_nonascii (starts , end );
5237- if (pos == size ) {
5238- // fast path: ASCII
5297+ if (pos == size ) { // fast path: ASCII string.
52395298 PyObject * u = PyUnicode_New (size , 127 );
52405299 if (u == NULL ) {
52415300 return NULL ;
@@ -5248,8 +5307,11 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52485307 }
52495308
52505309 int maxchr = 127 ;
5310+ Py_ssize_t maxsize = size ;
5311+
52515312 unsigned char ch = (unsigned char )s [pos ];
52525313 if (error_handler == _Py_ERROR_STRICT && ch >= 0xc2 ) {
5314+ maxsize = utf8_count ((const unsigned char * )s , size );
52535315 if (ch < 0xc4 ) { // latin1
52545316 maxchr = 255 ;
52555317 }
@@ -5260,7 +5322,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
52605322 maxchr = 0x10ffff ;
52615323 }
52625324 }
5263- PyObject * u = PyUnicode_New (size , maxchr );
5325+ PyObject * u = PyUnicode_New (maxsize , maxchr );
52645326 if (!u ) {
52655327 return NULL ;
52665328 }
0 commit comments