@@ -636,28 +636,72 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
636636 return build_string (string , stringEnd , intern , symbolize );
637637}
638638
/*
 * Read/write positions shared between json_string_unescape() and
 * json_copy_and_find_next_backslash() while a JSON string is unescaped.
 */
struct _json_unescape_cursor {
    /* Start of the source bytes that have not been copied to `buffer` yet. */
    const char *p;
    /* Scan position in the source; after a successful search it points at
     * the backslash that was found. */
    const char *pe;
    /* Next write position in the destination buffer. */
    char *buffer;
};

typedef struct _json_unescape_cursor JSON_UnescapeCursor;
644+
645+ static ALWAYS_INLINE () int json_copy_and_find_next_backslash (JSON_UnescapeCursor * cursor , const char * stringEnd )
646+ {
647+ if (cursor -> pe >= stringEnd ) {
648+ return 0 ;
649+ }
650+
651+ const char * p = cursor -> p ;
652+
653+ #ifdef HAVE_SIMD_NEON
654+ while (p + sizeof (uint8x16_t ) <= stringEnd ) {
655+ uint8x16_t chunk = vld1q_u8 ((const unsigned char * )p );
656+ vst1q_u8 ((unsigned char * )cursor -> buffer , chunk );
657+ uint8x16_t has_backslash = vceqq_u8 (chunk , vdupq_n_u8 ('\\' ));
658+ uint64_t mask = neon_match_mask (has_backslash );
659+ if (mask ) {
660+ uint32_t index = trailing_zeros64 (mask ) >> 2 ;
661+ cursor -> buffer += index ;
662+ cursor -> p = p + index ;
663+ cursor -> pe = p + index ;
664+ return 1 ;
665+ }
666+ p += sizeof (uint8x16_t );
667+ cursor -> buffer += sizeof (uint8x16_t );
668+ cursor -> p = p ;
669+ cursor -> pe = p ;
670+ }
671+ #endif
672+
673+ cursor -> pe = memchr (p , '\\' , stringEnd - p );
674+ if (cursor -> pe ) {
675+ if (cursor -> pe > p ) {
676+ MEMCPY (cursor -> buffer , p , char , cursor -> pe - p );
677+ cursor -> buffer += cursor -> pe - p ;
678+ cursor -> p = cursor -> pe ;
679+ }
680+ return 1 ;
681+ }
682+ return 0 ;
683+ }
684+
639685static VALUE json_string_unescape (JSON_ParserState * state , const char * string , const char * stringEnd , bool is_name , bool intern , bool symbolize )
640686{
641687 size_t bufferSize = stringEnd - string ;
642- const char * p = string , * pe = string , * bufferStart ;
643- char * buffer ;
644-
645688 VALUE result = rb_str_buf_new (bufferSize );
646689 rb_enc_associate_index (result , utf8_encindex );
647- buffer = RSTRING_PTR (result );
648- bufferStart = buffer ;
690+ const char * bufferStart = RSTRING_PTR (result );
649691
650- #define APPEND_CHAR (chr ) *buffer++ = chr; p = ++pe;
692+ JSON_UnescapeCursor cursor = {
693+ .p = string ,
694+ .pe = string ,
695+ .buffer = (char * )bufferStart ,
696+ };
651697
652- while (pe < stringEnd && (pe = memchr (pe , '\\' , stringEnd - pe ))) {
653- if (pe > p ) {
654- MEMCPY (buffer , p , char , pe - p );
655- buffer += pe - p ;
656- }
657- switch (* ++ pe ) {
698+ #define APPEND_CHAR (chr ) *cursor.buffer++ = chr; cursor.p = ++cursor.pe;
699+
700+ while (json_copy_and_find_next_backslash (& cursor , stringEnd )) {
701+ switch (* ++ cursor .pe ) {
658702 case '"' :
659703 case '/' :
660- p = pe ; // nothing to unescape just need to skip the backslash
704+ cursor . p = cursor . pe ; // nothing to unescape just need to skip the backslash
661705 break ;
662706 case '\\' :
663707 APPEND_CHAR ('\\' );
@@ -678,11 +722,11 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
678722 APPEND_CHAR ('\f' );
679723 break ;
680724 case 'u' :
681- if (pe > stringEnd - 5 ) {
682- raise_parse_error_at ("incomplete unicode character escape sequence at %s" , state , p );
725+ if (cursor . pe > stringEnd - 5 ) {
726+ raise_parse_error_at ("incomplete unicode character escape sequence at %s" , state , cursor . p );
683727 } else {
684- uint32_t ch = unescape_unicode (state , (unsigned char * ) ++ pe );
685- pe += 3 ;
728+ uint32_t ch = unescape_unicode (state , (unsigned char * ) ++ cursor . pe );
729+ cursor . pe += 3 ;
686730 /* To handle values above U+FFFF, we take a sequence of
687731 * \uXXXX escapes in the U+D800..U+DBFF then
688732 * U+DC00..U+DFFF ranges, take the low 10 bits from each
@@ -694,48 +738,48 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
694738 * Area".
695739 */
696740 if ((ch & 0xFC00 ) == 0xD800 ) {
697- pe ++ ;
698- if (pe > stringEnd - 6 ) {
699- raise_parse_error_at ("incomplete surrogate pair at %s" , state , p );
741+ cursor . pe ++ ;
742+ if (cursor . pe > stringEnd - 6 ) {
743+ raise_parse_error_at ("incomplete surrogate pair at %s" , state , cursor . p );
700744 }
701- if (pe [0 ] == '\\' && pe [1 ] == 'u' ) {
702- uint32_t sur = unescape_unicode (state , (unsigned char * ) pe + 2 );
745+ if (cursor . pe [0 ] == '\\' && cursor . pe [1 ] == 'u' ) {
746+ uint32_t sur = unescape_unicode (state , (unsigned char * ) cursor . pe + 2 );
703747
704748 if ((sur & 0xFC00 ) != 0xDC00 ) {
705- raise_parse_error_at ("invalid surrogate pair at %s" , state , p );
749+ raise_parse_error_at ("invalid surrogate pair at %s" , state , cursor . p );
706750 }
707751
708752 ch = (((ch & 0x3F ) << 10 ) | ((((ch >> 6 ) & 0xF ) + 1 ) << 16 )
709753 | (sur & 0x3FF ));
710- pe += 5 ;
754+ cursor . pe += 5 ;
711755 } else {
712- raise_parse_error_at ("incomplete surrogate pair at %s" , state , p );
756+ raise_parse_error_at ("incomplete surrogate pair at %s" , state , cursor . p );
713757 break ;
714758 }
715759 }
716760
717761 char buf [4 ];
718762 int unescape_len = convert_UTF32_to_UTF8 (buf , ch );
719- MEMCPY (buffer , buf , char , unescape_len );
720- buffer += unescape_len ;
721- p = ++ pe ;
763+ MEMCPY (cursor . buffer , buf , char , unescape_len );
764+ cursor . buffer += unescape_len ;
765+ cursor . p = ++ cursor . pe ;
722766 }
723767 break ;
724768 default :
725- if ((unsigned char )* pe < 0x20 ) {
726- raise_parse_error_at ("invalid ASCII control character in string: %s" , state , pe - 1 );
769+ if ((unsigned char )* cursor . pe < 0x20 ) {
770+ raise_parse_error_at ("invalid ASCII control character in string: %s" , state , cursor . pe - 1 );
727771 }
728- raise_parse_error_at ("invalid escape character in string: %s" , state , pe - 1 );
772+ raise_parse_error_at ("invalid escape character in string: %s" , state , cursor . pe - 1 );
729773 break ;
730774 }
731775 }
732776#undef APPEND_CHAR
733777
734- if (stringEnd > p ) {
735- MEMCPY (buffer , p , char , stringEnd - p );
736- buffer += stringEnd - p ;
778+ if (stringEnd > cursor . p ) {
779+ MEMCPY (cursor . buffer , cursor . p , char , stringEnd - cursor . p );
780+ cursor . buffer += stringEnd - cursor . p ;
737781 }
738- rb_str_set_len (result , buffer - bufferStart );
782+ rb_str_set_len (result , cursor . buffer - bufferStart );
739783
740784 if (symbolize ) {
741785 result = rb_str_intern (result );
0 commit comments