@@ -4979,39 +4979,228 @@ PyUnicode_DecodeUTF8(const char *s,
49794979#include "stringlib/codecs.h"
49804980#include "stringlib/undef.h"
49814981
4982+ #if (SIZEOF_SIZE_T == 8 )
49824983/* Mask to quickly check whether a C 'size_t' contains a
49834984 non-ASCII, UTF8-encoded char. */
4984- #if (SIZEOF_SIZE_T == 8 )
49854985# define ASCII_CHAR_MASK 0x8080808080808080ULL
4986+ // used to count codepoints in UTF-8 string.
4987+ # define VECTOR_0101 0x0101010101010101ULL
4988+ # define VECTOR_00FF 0x00ff00ff00ff00ffULL
49864989#elif (SIZEOF_SIZE_T == 4 )
49874990# define ASCII_CHAR_MASK 0x80808080U
4991+ # define VECTOR_0101 0x01010101U
4992+ # define VECTOR_00FF 0x00ff00ffU
49884993#else
49894994# error C 'size_t' size should be either 4 or 8!
49904995#endif
49914996
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
/*
 * Count trailing zero bits: return the index of the least significant set
 * bit of `v`.
 *
 * `v` must be non-zero.  The underlying builtin has an undefined result for
 * zero, so the precondition is checked in debug builds.
 */
static inline unsigned int
ctz(size_t v)
{
    assert(v != 0);
    return (unsigned int)__builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
/*
 * Count trailing zero bits: return the index of the least significant set
 * bit of `v`.
 *
 * `v` must be non-zero.  The bit-scan intrinsic leaves `pos` unset when no
 * bit is set, so the precondition is checked in debug builds.
 */
static inline unsigned int
ctz(size_t v)
{
    assert(v != 0);
    unsigned long pos;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return (unsigned int)pos;
}
#endif
5018+
5019+ #if HAVE_CTZ
5020+ // load p[0]..p[size-1] as a little-endian size_t
5021+ // without unaligned access nor read ahead.
5022+ static size_t
5023+ load_unaligned (const unsigned char * p , size_t size )
5024+ {
5025+ assert (size <= SIZEOF_SIZE_T );
5026+ union {
5027+ size_t s ;
5028+ unsigned char b [SIZEOF_SIZE_T ];
5029+ } u ;
5030+ u .s = 0 ;
5031+ switch (size ) {
5032+ case 8 :
5033+ u .b [7 ] = p [7 ];
5034+ _Py_FALLTHROUGH ;
5035+ case 7 :
5036+ u .b [6 ] = p [6 ];
5037+ _Py_FALLTHROUGH ;
5038+ case 6 :
5039+ u .b [5 ] = p [5 ];
5040+ _Py_FALLTHROUGH ;
5041+ case 5 :
5042+ u .b [4 ] = p [4 ];
5043+ _Py_FALLTHROUGH ;
5044+ case 4 :
5045+ u .b [3 ] = p [3 ];
5046+ _Py_FALLTHROUGH ;
5047+ case 3 :
5048+ u .b [2 ] = p [2 ];
5049+ _Py_FALLTHROUGH ;
5050+ case 2 :
5051+ u .b [1 ] = p [1 ];
5052+ _Py_FALLTHROUGH ;
5053+ case 1 :
5054+ u .b [0 ] = p [0 ];
5055+ break ;
5056+ case 0 :
5057+ break ;
5058+ default :
5059+ Py_UNREACHABLE ();
5060+ }
5061+ return u .s ;
5062+ }
5063+ #endif
5064+
5065+ /*
5066+ * Find the first non-ASCII character in a byte sequence.
5067+ *
5068+ * This function scans a range of bytes from `start` to `end` and returns the
5069+ * index of the first byte that is not an ASCII character (i.e., has the most
5070+ * significant bit set). If all characters in the range are ASCII, it returns
5071+ * `end - start`.
5072+ */
49925073static Py_ssize_t
4993- ascii_decode (const char * start , const char * end , Py_UCS1 * dest )
5074+ find_first_nonascii (const unsigned char * start , const unsigned char * end )
49945075{
4995- const char * p = start ;
5076+ const unsigned char * p = start ;
49965077
5078+ if (end - start >= SIZEOF_SIZE_T ) {
5079+ const unsigned char * p2 = _Py_ALIGN_UP (p , SIZEOF_SIZE_T );
5080+ if (p < p2 ) {
5081+ #if HAVE_CTZ
5082+ #if defined(_M_AMD64 ) || defined(_M_IX86 ) || defined(__x86_64__ ) || defined(__i386__ )
5083+ // x86 and amd64 are little endian and can load unaligned memory.
5084+ size_t u = * (const size_t * )p & ASCII_CHAR_MASK ;
5085+ #else
5086+ size_t u = load_unaligned (p , p2 - p ) & ASCII_CHAR_MASK ;
5087+ #endif
5088+ if (u ) {
5089+ return p - start + (ctz (u ) - 7 ) / 8 ;
5090+ }
5091+ p = p2 ;
5092+ }
5093+ #else
5094+ while (p < p2 ) {
5095+ if (* p & 0x80 ) {
5096+ return p - start ;
5097+ }
5098+ p ++ ;
5099+ }
5100+ #endif
5101+ const unsigned char * e = end - SIZEOF_SIZE_T ;
5102+ while (p <= e ) {
5103+ size_t u = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
5104+ if (u ) {
5105+ #if PY_LITTLE_ENDIAN && HAVE_CTZ
5106+ return p - start + (ctz (u ) - 7 ) / 8 ;
5107+ #else
5108+ // big endian and minor compilers are difficult to test.
5109+ // fallback to per byte check.
5110+ break ;
5111+ #endif
5112+ }
5113+ p += SIZEOF_SIZE_T ;
5114+ }
5115+ }
5116+ #if HAVE_CTZ
5117+ // we can not use *(const size_t*)p to avoid buffer overrun.
5118+ size_t u = load_unaligned (p , end - p ) & ASCII_CHAR_MASK ;
5119+ if (u ) {
5120+ return p - start + (ctz (u ) - 7 ) / 8 ;
5121+ }
5122+ return end - start ;
5123+ #else
5124+ while (p < end ) {
5125+ if (* p & 0x80 ) {
5126+ break ;
5127+ }
5128+ p ++ ;
5129+ }
5130+ return p - start ;
5131+ #endif
5132+ }
5133+
// Return 1 if `ch` can begin a UTF-8 sequence, 0 if it is a continuation
// byte.  Only bits 7 and 6 of `ch` are inspected.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // 10xxxxxx is the only continuation pattern; 0xxxxxxx and 11xxxxxx
    // both count as a first byte.
    const unsigned int top_two_bits = ch & 0xC0u;
    return top_two_bits != 0x80u;
}
5140+
5141+ static inline size_t
5142+ vector_utf8_start_chars (size_t v )
5143+ {
5144+ return ((~v >> 7 ) | (v >> 6 )) & VECTOR_0101 ;
5145+ }
5146+
5147+
5148+ // Count the number of UTF-8 code points in a given byte sequence.
5149+ static Py_ssize_t
5150+ utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5151+ {
5152+ Py_ssize_t len = 0 ;
5153+
5154+ if (end - s >= SIZEOF_SIZE_T ) {
5155+ while (!_Py_IS_ALIGNED (s , ALIGNOF_SIZE_T )) {
5156+ len += scalar_utf8_start_char (* s ++ );
5157+ }
5158+
5159+ while (s + SIZEOF_SIZE_T <= end ) {
5160+ const unsigned char * e = end ;
5161+ if (e - s > SIZEOF_SIZE_T * 255 ) {
5162+ e = s + SIZEOF_SIZE_T * 255 ;
5163+ }
5164+ Py_ssize_t vstart = 0 ;
5165+ while (s + SIZEOF_SIZE_T <= e ) {
5166+ size_t v = * (size_t * )s ;
5167+ size_t vs = vector_utf8_start_chars (v );
5168+ vstart += vs ;
5169+ s += SIZEOF_SIZE_T ;
5170+ }
5171+ vstart = (vstart & VECTOR_00FF ) + ((vstart >> 8 ) & VECTOR_00FF );
5172+ vstart += vstart >> 16 ;
5173+ #if SIZEOF_SIZE_T == 8
5174+ vstart += vstart >> 32 ;
5175+ #endif
5176+ len += vstart & 0x7ff ;
5177+ }
5178+ }
5179+ while (s < end ) {
5180+ len += scalar_utf8_start_char (* s ++ );
5181+ }
5182+ return len ;
5183+ }
5184+
5185+ static Py_ssize_t
5186+ ascii_decode (const char * start , const char * end , Py_UCS1 * dest )
5187+ {
49975188#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4998- if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )
5189+ if (_Py_IS_ALIGNED (start , ALIGNOF_SIZE_T )
49995190 && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
50005191 {
50015192 /* Fast path, see in STRINGLIB(utf8_decode) for
50025193 an explanation. */
5003- /* Help allocation */
5004- const char * _p = p ;
5005- Py_UCS1 * q = dest ;
5006- while (_p + SIZEOF_SIZE_T <= end ) {
5007- size_t value = * (const size_t * ) _p ;
5194+ const char * p = start ;
5195+ Py_UCS1 * q = dest ;
5196+ while (p + SIZEOF_SIZE_T <= end ) {
5197+ size_t value = * (const size_t * ) p ;
50085198 if (value & ASCII_CHAR_MASK )
50095199 break ;
50105200 * ((size_t * )q ) = value ;
5011- _p += SIZEOF_SIZE_T ;
5201+ p += SIZEOF_SIZE_T ;
50125202 q += SIZEOF_SIZE_T ;
50135203 }
5014- p = _p ;
50155204 while (p < end ) {
50165205 if ((unsigned char )* p & 0x80 )
50175206 break ;
@@ -5020,31 +5209,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
50205209 return p - start ;
50215210 }
50225211#endif
5023- while (p < end ) {
5024- /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5025- for an explanation. */
5026- if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )) {
5027- /* Help allocation */
5028- const char * _p = p ;
5029- while (_p + SIZEOF_SIZE_T <= end ) {
5030- size_t value = * (const size_t * ) _p ;
5031- if (value & ASCII_CHAR_MASK )
5032- break ;
5033- _p += SIZEOF_SIZE_T ;
5034- }
5035- p = _p ;
5036- if (_p == end )
5037- break ;
5038- }
5039- if ((unsigned char )* p & 0x80 )
5040- break ;
5041- ++ p ;
5042- }
5043- memcpy (dest , start , p - start );
5044- return p - start ;
5212+ Py_ssize_t pos = find_first_nonascii ((const unsigned char * )start ,
5213+ (const unsigned char * )end );
5214+ memcpy (dest , start , pos );
5215+ return pos ;
50455216}
50465217
5047-
50485218static int
50495219unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
50505220 const char * starts , const char * s , const char * end ,
@@ -5188,27 +5358,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
51885358 return get_latin1_char ((unsigned char )s [0 ]);
51895359 }
51905360
5191- // fast path: try ASCII string.
5192- const char * starts = s ;
5193- const char * end = s + size ;
5194- PyObject * u = PyUnicode_New (size , 127 );
5195- if (u == NULL ) {
5361+ // I don't know this check is necessary or not. But there is a test
5362+ // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5363+ if (PY_SSIZE_T_MAX - sizeof (PyCompactUnicodeObject ) < (size_t )size ) {
5364+ PyErr_NoMemory ();
51965365 return NULL ;
51975366 }
5198- Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
5199- if (decoded == size ) {
5367+
5368+ const char * starts = s ;
5369+ const char * end = s + size ;
5370+
5371+ Py_ssize_t pos = find_first_nonascii ((const unsigned char * )starts , (const unsigned char * )end );
5372+ if (pos == size ) { // fast path: ASCII string.
5373+ PyObject * u = PyUnicode_New (size , 127 );
5374+ if (u == NULL ) {
5375+ return NULL ;
5376+ }
5377+ memcpy (PyUnicode_1BYTE_DATA (u ), s , size );
52005378 if (consumed ) {
52015379 * consumed = size ;
52025380 }
52035381 return u ;
52045382 }
5205- s += decoded ;
5206- size -= decoded ;
5383+
5384+ int maxchr = 127 ;
5385+ Py_ssize_t maxsize = size ;
5386+
5387+ unsigned char ch = (unsigned char )(s [pos ]);
5388+ // error handler other than strict may remove/replace the invalid byte.
5389+ // consumed != NULL allows 1~3 bytes remainings.
5390+ // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5391+ // otherwise: check the input and decide the maxchr and maxsize to reduce
5392+ // reallocation and copy.
5393+ if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2 ) {
5394+ // we only calculate the number of codepoints and don't determine the exact maxchr.
5395+ // This is because writing fast and portable SIMD code to find maxchr is difficult.
5396+ // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5397+ // means that it is no longer necessary to allocate several times the required amount
5398+ // of memory.
5399+ maxsize = utf8_count_codepoints ((const unsigned char * )s , (const unsigned char * )end );
5400+ if (ch < 0xc4 ) { // latin1
5401+ maxchr = 0xff ;
5402+ }
5403+ else if (ch < 0xf0 ) { // ucs2
5404+ maxchr = 0xffff ;
5405+ }
5406+ else { // ucs4
5407+ maxchr = 0x10ffff ;
5408+ }
5409+ }
5410+ PyObject * u = PyUnicode_New (maxsize , maxchr );
5411+ if (!u ) {
5412+ return NULL ;
5413+ }
52075414
52085415 // Use _PyUnicodeWriter after fast path is failed.
52095416 _PyUnicodeWriter writer ;
52105417 _PyUnicodeWriter_InitWithBuffer (& writer , u );
5211- writer .pos = decoded ;
5418+ if (maxchr <= 255 ) {
5419+ memcpy (PyUnicode_1BYTE_DATA (u ), s , pos );
5420+ s += pos ;
5421+ size -= pos ;
5422+ writer .pos = pos ;
5423+ }
52125424
52135425 if (unicode_decode_utf8_impl (& writer , starts , s , end ,
52145426 error_handler , errors ,
@@ -5268,7 +5480,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
52685480 const char * errors ,
52695481 Py_ssize_t * consumed )
52705482{
5271- return unicode_decode_utf8 (s , size , _Py_ERROR_UNKNOWN , errors , consumed );
5483+ return unicode_decode_utf8 (s , size ,
5484+ errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT ,
5485+ errors , consumed );
52725486}
52735487
52745488
0 commit comments