@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
143143 {
144144 break ;
145145 }
146+ case ' b' :
147+ {
148+ *current_p = ' \b ' ;
149+ break ;
150+ }
146151 case ' f' :
147152 {
148153 *current_p = ' \f ' ;
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
163168 *current_p = ' \t ' ;
164169 break ;
165170 }
166- case ' b ' :
171+ case ' u ' :
167172 {
168- *current_p = ' \b ' ;
169- break ;
173+ lit_code_point_t code_point;
174+
175+ if (!(lit_read_code_point_from_hex (current_p + 1 , 4 , &code_point)))
176+ {
177+ return ;
178+ }
179+
180+ current_p += 5 ;
181+ write_p += lit_code_point_to_utf8 (code_point, write_p);
182+ continue ;
183+ /* FALLTHRU */
170184 }
171185 default :
172186 {
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
177191 *write_p++ = *current_p++;
178192 }
179193
194+ /*
195+ * Post processing surrogate pairs.
196+ *
197+ * The general issue is that surrogate fragments can come from
198+ * the original stream and can be constructed by \u sequences
199+ * as well. We need to construct code points from them.
200+ *
201+ * Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
202+ * The first \u is parsed by JSON, the second is by the lexer.
203+ *
204+ * The rewrite happens in-place, since the write pointer never
205+ * gets ahead of the read pointer. We also cannot create a UTF-8 iterator,
206+ * because the lit_is_utf8_string_valid assertion may fail.
207+ */
208+
209+ lit_utf8_byte_t *read_p = token_p->u .string .start_p ;
210+ lit_utf8_byte_t *read_end_p = write_p;
211+ write_p = read_p;
212+
213+ while (read_p < read_end_p)
214+ {
215+ lit_code_point_t code_point;
216+ read_p += lit_read_code_point_from_utf8 (read_p,
217+ (lit_utf8_size_t ) (read_end_p - read_p),
218+ &code_point);
219+
220+ /* The lit_is_code_unit_high_surrogate expects ecma_char_t argument
221+ so code_points above maximum UTF16 code unit must not be tested. */
222+ if (read_p < read_end_p
223+ && code_point <= LIT_UTF16_CODE_UNIT_MAX
224+ && lit_is_code_unit_high_surrogate ((ecma_char_t ) code_point))
225+ {
226+ lit_code_point_t next_code_point;
227+ lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
228+ (lit_utf8_size_t ) (read_end_p - read_p),
229+ &next_code_point);
230+
231+ if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
232+ && lit_is_code_unit_low_surrogate ((ecma_char_t ) next_code_point))
233+ {
234+ code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t ) code_point,
235+ (ecma_char_t ) next_code_point);
236+ read_p += next_code_point_size;
237+ }
238+ }
239+ write_p += lit_code_point_to_utf8 (code_point, write_p);
240+ }
241+
242+ JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u .string .start_p ,
243+ (lit_utf8_size_t ) (write_p - token_p->u .string .start_p )));
244+
180245 token_p->u .string .size = (lit_utf8_size_t ) (write_p - token_p->u .string .start_p );
181246 token_p->current_p = current_p + 1 ;
182247 token_p->type = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
757822 ret_value);
758823
759824 ecma_string_t *string_p = ecma_get_string_from_value (string);
760- ecma_length_t length = (uint32_t ) ecma_string_get_length (string_p);
761- size_t buffer_size = sizeof (lit_utf8_byte_t ) * (length + 1 );
825+ ecma_length_t string_size = (uint32_t ) ecma_string_get_size (string_p);
826+ size_t buffer_size = sizeof (lit_utf8_byte_t ) * (string_size + 1 );
762827
763828 MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t );
764829
765830 ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t ) buffer_size);
766- str_start_p[length ] = LIT_BYTE_NULL;
831+ str_start_p[string_size ] = LIT_BYTE_NULL;
767832
768833 ecma_json_token_t token;
769834 token.current_p = str_start_p;
770- token.end_p = str_start_p + length ;
835+ token.end_p = str_start_p + string_size ;
771836
772837 ecma_value_t final_result = ecma_builtin_json_parse_value (&token);
773838
0 commit comments