Skip to content

Commit a7a9e4e

Browse files
committed
Add API functions to create string from a valid UTF-8 string.
JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
1 parent dc5ae46 commit a7a9e4e

File tree

6 files changed

+183
-0
lines changed

6 files changed

+183
-0
lines changed

jerry-core/ecma/base/ecma-helpers-string.c

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,111 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
215215
return string_desc_p;
216216
} /* ecma_new_ecma_string_from_utf8 */
217217

218+
/**
219+
* Allocate new ecma-string and fill it with characters from the utf8 string
220+
* It converts all 4-bytes long unicode sequence to two 3-bytes long sequence
221+
*
222+
* @return pointer to ecma-string descriptor
223+
*/
224+
ecma_string_t *
225+
ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string_p, /**< utf-8 string */
226+
lit_utf8_size_t string_size) /**< utf-8 string size */
227+
{
228+
JERRY_ASSERT (string_p != NULL || string_size == 0);
229+
JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
230+
231+
ecma_string_t *string_desc_p;
232+
lit_utf8_byte_t *data_p;
233+
234+
ecma_length_t str_length = 0;
235+
lit_utf8_size_t conv_size = 0;
236+
lit_utf8_size_t size = 0;
237+
238+
/* Calculate the required length and size information of the converted cesu-8 encoded string */
239+
while (size < string_size)
240+
{
241+
if ((string_p[size] & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
242+
{
243+
size++;
244+
}
245+
else if ((string_p[size] & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
246+
{
247+
size += 2;
248+
}
249+
else if ((string_p[size] & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
250+
{
251+
size += 3;
252+
}
253+
else
254+
{
255+
JERRY_ASSERT ((string_p[size] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
256+
size += 4;
257+
conv_size += 2;
258+
}
259+
260+
str_length++;
261+
}
262+
263+
JERRY_ASSERT (size == string_size);
264+
265+
conv_size += size;
266+
267+
if (likely (string_size <= UINT16_MAX))
268+
{
269+
string_desc_p = jmem_heap_alloc_block (sizeof (ecma_string_t) + conv_size);
270+
271+
string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_UTF8_STRING | ECMA_STRING_REF_ONE;
272+
string_desc_p->u.common_field = 0;
273+
string_desc_p->u.utf8_string.size = (uint16_t) conv_size;
274+
string_desc_p->u.utf8_string.length = (uint16_t) str_length;
275+
276+
data_p = (lit_utf8_byte_t *) (string_desc_p + 1);
277+
}
278+
else
279+
{
280+
string_desc_p = jmem_heap_alloc_block (sizeof (ecma_long_string_t) + conv_size);
281+
282+
string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING | ECMA_STRING_REF_ONE;
283+
string_desc_p->u.common_field = 0;
284+
string_desc_p->u.long_utf8_string_size = conv_size;
285+
286+
ecma_long_string_t *long_string_desc_p = (ecma_long_string_t *) string_desc_p;
287+
long_string_desc_p->long_utf8_string_length = str_length;
288+
289+
data_p = (lit_utf8_byte_t *) (long_string_desc_p + 1);
290+
}
291+
292+
size = 0;
293+
294+
while (size < string_size)
295+
{
296+
if ((string_p[size] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
297+
{
298+
/* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */
299+
uint32_t character = ((((uint32_t) string_p[size++]) & 0x7) << 18);
300+
character |= ((((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
301+
character |= ((((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
302+
character |= (((uint32_t) string_p[size++]) & LIT_UTF8_LAST_6_BITS_MASK);
303+
304+
JERRY_ASSERT (character >= 0x10000);
305+
character -= 0x10000;
306+
307+
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xd800 | (character >> 10)));
308+
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK)));
309+
}
310+
else
311+
{
312+
*data_p++ = string_p[size++];
313+
}
314+
}
315+
316+
JERRY_ASSERT (size == string_size);
317+
318+
string_desc_p->hash = lit_utf8_string_calc_hash (data_p, conv_size);
319+
320+
return string_desc_p;
321+
} /* ecma_new_ecma_string_from_utf8_converted_to_cesu8 */
322+
218323
/**
219324
* Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
220325
*

jerry-core/ecma/base/ecma-helpers.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ extern void ecma_free_value_if_not_object (ecma_value_t);
164164

165165
/* ecma-helpers-string.c */
166166
extern ecma_string_t *ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t);
167+
/* Creates an ecma-string from a utf-8 buffer, converting 4-byte sequences to cesu-8 surrogate pairs. */
extern ecma_string_t *ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *, lit_utf8_size_t);
167168
extern ecma_string_t *ecma_new_ecma_string_from_code_unit (ecma_char_t);
168169
extern ecma_string_t *ecma_new_ecma_string_from_uint32 (uint32_t);
169170
extern ecma_string_t *ecma_new_ecma_string_from_number (ecma_number_t);

jerry-core/jerry-api.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,8 @@ jerry_value_t jerry_create_number_infinity (bool);
248248
jerry_value_t jerry_create_number_nan (void);
249249
jerry_value_t jerry_create_null (void);
250250
jerry_value_t jerry_create_object (void);
251+
/* Creates a string value from a valid UTF-8 string; the result must be freed with jerry_release_value. */
jerry_value_t jerry_create_string_from_utf8 (const jerry_char_t *);
252+
/* Same as jerry_create_string_from_utf8, but the string size is passed explicitly. */
jerry_value_t jerry_create_string_sz_from_utf8 (const jerry_char_t *, jerry_size_t);
251253
jerry_value_t jerry_create_string (const jerry_char_t *);
252254
jerry_value_t jerry_create_string_sz (const jerry_char_t *, jerry_size_t);
253255
jerry_value_t jerry_create_undefined (void);

jerry-core/jerry.c

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,49 @@ jerry_create_object (void)
913913
return ecma_make_object_value (ecma_op_create_object_object_noarg ());
914914
} /* jerry_create_object */
915915

916+
/**
917+
* Create string from a valid UTF8 string
918+
*
919+
* Note:
920+
* returned value must be freed with jerry_release_value when it is no longer needed.
921+
*
922+
* @return value of the created string
923+
*/
924+
jerry_value_t
925+
jerry_create_string_from_utf8 (const jerry_char_t *str_p) /**< pointer to string */
926+
{
927+
return jerry_create_string_sz_from_utf8 (str_p, lit_zt_utf8_string_size ((lit_utf8_byte_t *) str_p));
928+
} /* jerry_create_string_from_utf8 */
929+
930+
/**
931+
* Create string from a valid UTF8 string
932+
*
933+
* Note:
934+
* returned value must be freed with jerry_release_value when it is no longer needed.
935+
*
936+
* @return value of the created string
937+
*/
938+
jerry_value_t
939+
jerry_create_string_sz_from_utf8 (const jerry_char_t *str_p, /**< pointer to string */
940+
jerry_size_t str_size) /**< string size */
941+
{
942+
jerry_assert_api_available ();
943+
944+
ecma_string_t *ecma_str_p = NULL;
945+
946+
if (lit_utf8_contains_four_bytes_unicode ((lit_utf8_byte_t *) str_p, (lit_utf8_size_t) str_size))
947+
{
948+
ecma_str_p = ecma_new_ecma_string_from_utf8_converted_to_cesu8 ((lit_utf8_byte_t *) str_p,
949+
(lit_utf8_size_t) str_size);
950+
}
951+
else
952+
{
953+
ecma_str_p = ecma_new_ecma_string_from_utf8 ((lit_utf8_byte_t *) str_p, (lit_utf8_size_t) str_size);
954+
}
955+
956+
return ecma_make_string_value (ecma_str_p);
957+
} /* jerry_create_string_sz_from_utf8 */
958+
916959
/**
917960
* Create string from a valid CESU8 string
918961
*

jerry-core/lit/lit-strings.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,3 +797,33 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
797797

798798
return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
799799
} /* lit_compare_utf8_strings_relational */
800+
801+
/**
802+
* Check whether the utf-8 string contains 4-bytes long unicode or not.
803+
*
804+
* @return true if utf-8 string contains 4-bytes long unicode
805+
* false otherwise
806+
*/
807+
bool
808+
lit_utf8_contains_four_bytes_unicode (const lit_utf8_byte_t *buf_p, lit_utf8_size_t buf_size)
809+
{
810+
JERRY_ASSERT (buf_p != NULL || buf_size == 0);
811+
812+
lit_utf8_size_t size = 0;
813+
814+
while (size < buf_size)
815+
{
816+
if ((buf_p[size] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
817+
{
818+
return true;
819+
}
820+
else
821+
{
822+
size++;
823+
}
824+
}
825+
826+
JERRY_ASSERT (size == buf_size);
827+
828+
return false;
829+
} /* lit_utf8_contains_four_bytes_unicode */

jerry-core/lit/lit-strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,6 @@ ecma_char_t lit_utf8_peek_prev (const lit_utf8_byte_t *);
135135
void lit_utf8_incr (const lit_utf8_byte_t **);
136136
void lit_utf8_decr (const lit_utf8_byte_t **);
137137

138+
/* Returns true if the utf-8 buffer contains a byte that starts a 4-byte sequence. */
bool lit_utf8_contains_four_bytes_unicode (const lit_utf8_byte_t *, lit_utf8_size_t);
139+
138140
#endif /* !LIT_STRINGS_H */

0 commit comments

Comments
 (0)