Skip to content

Commit dcd610b

Browse files
committed
Change internal encoding of strings to CESU-8
JerryScript-DCO-1.0-Signed-off-by: Zsolt Borbély zsborbely.u-szeged@partner.samsung.com
JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai.u-szeged@partner.samsung.com
1 parent 08c618e commit dcd610b

14 files changed

+623
-413
lines changed

jerry-core/ecma/base/ecma-helpers-string.cpp

Lines changed: 42 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
414414
lit_utf8_size_t string_size) /**< string size */
415415
{
416416
JERRY_ASSERT (string_p != NULL || string_size == 0);
417-
JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
417+
JERRY_ASSERT (lit_is_cesu8_string_valid (string_p, string_size));
418418

419419
lit_magic_string_id_t magic_string_id;
420420
if (lit_is_utf8_string_magic (string_p, string_size, &magic_string_id))
@@ -444,7 +444,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
444444
} /* ecma_new_ecma_string_from_utf8 */
445445

446446
/**
447-
* Allocate new ecma-string and fill it with utf-8 character which represents specified code unit
447+
* Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
448448
*
449449
* @return pointer to ecma-string descriptor
450450
*/
@@ -627,14 +627,7 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
627627
jerry_fatal (ERR_OUT_OF_MEMORY);
628628
}
629629

630-
ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
631-
ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);
632-
633-
bool is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
634-
&& lit_is_code_unit_low_surrogate (str2_first_code_unit));
635-
636-
lit_utf8_size_t buffer_size = str1_size + str2_size - (lit_utf8_size_t) (is_surrogate_pair_sliced ?
637-
LIT_UTF8_CESU8_SURROGATE_SIZE_DIF : 0);
630+
lit_utf8_size_t buffer_size = str1_size + str2_size;
638631

639632
lit_utf8_byte_t *str_p = (lit_utf8_byte_t *) mem_heap_alloc_block (buffer_size, MEM_HEAP_ALLOC_SHORT_TERM);
640633

@@ -643,23 +636,9 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
643636
bytes_copied1 = ecma_string_to_utf8_string (string1_p, str_p, (ssize_t) str1_size);
644637
JERRY_ASSERT (bytes_copied1 > 0);
645638

646-
if (!is_surrogate_pair_sliced)
647-
{
648-
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
649-
JERRY_ASSERT (bytes_copied2 > 0);
650-
}
651-
else
652-
{
653-
bytes_copied2 = ecma_string_to_utf8_string (string2_p,
654-
str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + 1,
655-
(ssize_t) buffer_size - bytes_copied1
656-
+ LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
657-
JERRY_ASSERT (bytes_copied2 > 0);
639+
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
640+
JERRY_ASSERT (bytes_copied2 > 0);
658641

659-
lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
660-
str2_first_code_unit);
661-
lit_code_point_to_utf8 (surrogate_code_point, str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
662-
}
663642
ecma_string_t *str_concat_p = ecma_new_ecma_string_from_utf8 (str_p, buffer_size);
664643

665644
mem_heap_free_block ((void*) str_p);
@@ -955,7 +934,7 @@ ecma_string_get_array_index (const ecma_string_t *str_p, /**< ecma-string */
955934
} /* ecma_string_get_array_index */
956935

957936
/**
958-
* Convert ecma-string's contents to a utf-8 string and put it to the buffer.
937+
* Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
959938
*
960939
* @return number of bytes, actually copied to the buffer - if string's content was copied successfully;
961940
* otherwise (in case size of buffer is insufficient) - negative number, which is calculated
@@ -1018,7 +997,6 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string
1018997

1019998
break;
1020999
}
1021-
10221000
case ECMA_STRING_CONTAINER_MAGIC_STRING:
10231001
{
10241002
const lit_magic_string_id_t id = string_desc_p->u.magic_string_id;
@@ -1491,7 +1469,7 @@ ecma_string_get_char_at_pos (const ecma_string_t *string_p, /**< ecma-string */
14911469
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
14921470
JERRY_ASSERT (sz > 0);
14931471

1494-
ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);;
1472+
ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);
14951473

14961474
MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);
14971475

@@ -1682,10 +1660,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
16821660
JERRY_ASSERT (end_pos <= string_length);
16831661
#endif
16841662

1685-
const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
1686-
const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;
1687-
1688-
if (utf8_str_size)
1663+
if (start_pos < end_pos)
16891664
{
16901665
/**
16911666
* I. Dump original string to plain buffer
@@ -1701,20 +1676,22 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
17011676
/**
17021677
* II. Extract substring
17031678
*/
1704-
MEM_DEFINE_LOCAL_ARRAY (utf8_substr_buffer, utf8_str_size, lit_utf8_byte_t);
1679+
lit_utf8_byte_t *start_p = utf8_str_p;
1680+
end_pos -= start_pos;
17051681

1706-
lit_utf8_size_t utf8_substr_buffer_offset = 0;
1707-
for (ecma_length_t idx = 0; idx < span; idx++)
1682+
while (start_pos--)
17081683
{
1709-
ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
1684+
start_p += lit_get_unicode_char_size_by_utf8_first_byte (*start_p);
1685+
}
17101686

1711-
JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
1712-
utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
1687+
lit_utf8_byte_t *end_p = start_p;
1688+
while (end_pos--)
1689+
{
1690+
end_p += lit_get_unicode_char_size_by_utf8_first_byte (*end_p);
17131691
}
17141692

1715-
ecma_string_p = ecma_new_ecma_string_from_utf8 (utf8_substr_buffer, utf8_substr_buffer_offset);
1693+
ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p, (lit_utf8_size_t) (end_p - start_p));
17161694

1717-
MEM_FINALIZE_LOCAL_ARRAY (utf8_substr_buffer);
17181695
MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);
17191696

17201697
return ecma_string_p;
@@ -1746,47 +1723,47 @@ ecma_string_trim (const ecma_string_t *string_p) /**< pointer to an ecma string
17461723
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
17471724
JERRY_ASSERT (sz >= 0);
17481725

1749-
lit_utf8_iterator_t front = lit_utf8_iterator_create (utf8_str_p, buffer_size);
1750-
1751-
lit_utf8_iterator_t back = lit_utf8_iterator_create (utf8_str_p, buffer_size);
1752-
lit_utf8_iterator_seek_eos (&back);
1753-
1754-
lit_utf8_iterator_pos_t start = lit_utf8_iterator_get_pos (&back);
1755-
lit_utf8_iterator_pos_t end = lit_utf8_iterator_get_pos (&front);
1756-
1757-
ecma_char_t current;
1726+
ecma_char_t ch;
1727+
lit_utf8_size_t read_size;
1728+
lit_utf8_byte_t *nonws_start_p = utf8_str_p + buffer_size;
1729+
lit_utf8_byte_t *current_p = utf8_str_p;
17581730

17591731
/* Trim front. */
1760-
while (!lit_utf8_iterator_is_eos (&front))
1732+
while (current_p < nonws_start_p)
17611733
{
1762-
current = lit_utf8_iterator_read_next (&front);
1763-
if (!lit_char_is_white_space (current)
1764-
&& !lit_char_is_line_terminator (current))
1734+
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);
1735+
1736+
if (!lit_char_is_white_space (ch)
1737+
&& !lit_char_is_line_terminator (ch))
17651738
{
1766-
lit_utf8_iterator_decr (&front);
1767-
start = lit_utf8_iterator_get_pos (&front);
1739+
nonws_start_p = current_p;
17681740
break;
17691741
}
1742+
1743+
current_p += read_size;
17701744
}
17711745

1746+
current_p = utf8_str_p + buffer_size;
1747+
17721748
/* Trim back. */
1773-
while (!lit_utf8_iterator_is_bos (&back))
1749+
while (current_p > utf8_str_p)
17741750
{
1775-
current = lit_utf8_iterator_read_prev (&back);
1776-
if (!lit_char_is_white_space (current)
1777-
&& !lit_char_is_line_terminator (current))
1751+
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);
1752+
1753+
if (!lit_char_is_white_space (ch)
1754+
&& !lit_char_is_line_terminator (ch))
17781755
{
1779-
lit_utf8_iterator_incr (&back);
1780-
end = lit_utf8_iterator_get_pos (&back);
17811756
break;
17821757
}
1758+
1759+
current_p -= read_size;
17831760
}
17841761

17851762
/* Construct new string. */
1786-
if (end.offset > start.offset)
1763+
if (current_p > nonws_start_p)
17871764
{
1788-
ret_string_p = ecma_new_ecma_string_from_utf8 (utf8_str_p + start.offset,
1789-
(lit_utf8_size_t) (end.offset - start.offset));
1765+
ret_string_p = ecma_new_ecma_string_from_utf8 (nonws_start_p,
1766+
(lit_utf8_size_t) (current_p - nonws_start_p));
17901767
}
17911768
else
17921769
{

jerry-core/ecma/builtin-objects/ecma-builtin-function.cpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,13 @@ ecma_builtin_function_helper_get_arguments (const ecma_value_t *arguments_list_p
9797
ssize_t sz = ecma_string_to_utf8_string (str_p, start_p, (ssize_t) str_size);
9898
JERRY_ASSERT (sz >= 0);
9999

100-
lit_utf8_iterator_t iter = lit_utf8_iterator_create (start_p, str_size);
100+
lit_utf8_byte_t *current_p = start_p;
101+
const lit_utf8_byte_t *string_end_p = start_p + str_size;
101102

102-
while (!lit_utf8_iterator_is_eos (&iter))
103+
while (current_p < string_end_p)
103104
{
104-
ecma_char_t current_char = lit_utf8_iterator_read_next (&iter);
105+
ecma_char_t current_char;
106+
current_p += lit_read_code_unit_from_utf8 (current_p, &current_char);
105107

106108
if (current_char == ',')
107109
{
@@ -197,33 +199,36 @@ ecma_builtin_function_dispatch_construct (const ecma_value_t *arguments_list_p,
197199
ssize_t sz = ecma_string_to_utf8_string (arguments_str_p, start_p, (ssize_t) str_size);
198200
JERRY_ASSERT (sz >= 0);
199201

200-
lit_utf8_iterator_t iter = lit_utf8_iterator_create (start_p, str_size);
201-
ecma_length_t last_separator = lit_utf8_iterator_get_index (&iter);
202-
ecma_length_t end_position;
202+
lit_utf8_byte_t *current_p = start_p;
203+
lit_utf8_byte_t *last_separator = start_p;
204+
lit_utf8_byte_t *end_position;
205+
const lit_utf8_byte_t *string_end_p = start_p + str_size;
203206
ecma_string_t *param_str_p;
204207

205-
while (!lit_utf8_iterator_is_eos (&iter))
208+
while (current_p < string_end_p)
206209
{
207-
ecma_char_t current_char = lit_utf8_iterator_read_next (&iter);
210+
ecma_char_t current_char;
211+
lit_utf8_size_t read_size = lit_read_code_unit_from_utf8 (current_p, &current_char);
208212

209213
if (current_char == ',')
210214
{
211-
lit_utf8_iterator_decr (&iter);
212-
end_position = lit_utf8_iterator_get_index (&iter);
215+
end_position = current_p;
213216

214-
param_str_p = ecma_string_substr (arguments_str_p, last_separator, end_position);
217+
param_str_p = ecma_new_ecma_string_from_utf8 (last_separator,
218+
(lit_utf8_size_t) (end_position - last_separator));
215219
string_params_p[params_count] = ecma_string_trim (param_str_p);
216220
ecma_deref_ecma_string (param_str_p);
217221

218-
lit_utf8_iterator_incr (&iter);
219-
last_separator = lit_utf8_iterator_get_index (&iter);
220-
222+
last_separator = current_p + read_size;
221223
params_count++;
222224
}
225+
226+
current_p += read_size;
223227
}
224228

225-
end_position = lit_utf8_string_length (start_p, str_size);
226-
param_str_p = ecma_string_substr (arguments_str_p, last_separator, end_position);
229+
end_position = (lit_utf8_byte_t *) string_end_p;
230+
param_str_p = ecma_new_ecma_string_from_utf8 (last_separator,
231+
(lit_utf8_size_t) (end_position - last_separator));
227232
string_params_p[params_count] = ecma_string_trim (param_str_p);
228233
ecma_deref_ecma_string (param_str_p);
229234
params_count++;

0 commit comments

Comments
 (0)