Skip to content

Commit 06b4490

Browse files
committed
Optimize encode/decode URI for valid UTF-8 input.
JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu
1 parent d1a5f7f commit 06b4490

File tree

3 files changed

+116
-182
lines changed

3 files changed

+116
-182
lines changed

jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp

Lines changed: 85 additions & 168 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "ecma-try-catch-macro.h"
2626
#include "jrt.h"
2727
#include "lit-magic-strings.h"
28+
#include "lit-strings.h"
2829
#include "vm.h"
2930
#include "jrt-libc-includes.h"
3031

@@ -511,7 +512,12 @@ static uint8_t unescaped_uri_component_set[16] =
511512
0xfe, 0xff, 0xff, 0x87, 0xfe, 0xff, 0xff, 0x47
512513
};
513514

514-
#define ECMA_BUILTIN_HEX_TO_BYTE_ERROR 0x100
515+
/*
516+
* Format is a percent sign followed by two hex digits.
517+
*/
518+
#define URI_ENCODED_BYTE_SIZE (3)
519+
520+
#define ECMA_BUILTIN_HEX_TO_BYTE_ERROR (0x100)
515521

516522
/**
517523
* Helper function to decode a hexadecimal byte from a string.
@@ -598,7 +604,11 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
598604

599605
while (input_char_p < input_end_p)
600606
{
601-
/* Input validation. */
607+
/*
608+
* We expect that the input is a valid UTF-8 sequence,
609+
* so characters >= 0x80 can be let through.
610+
*/
611+
602612
if (*input_char_p != '%')
603613
{
604614
output_size++;
@@ -613,9 +623,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
613623
break;
614624
}
615625

616-
input_char_p += 3;
626+
input_char_p += URI_ENCODED_BYTE_SIZE;
617627

618-
if (decoded_byte <= 0x7f)
628+
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
619629
{
620630
/*
621631
* We don't decode those bytes, which are part of reserved_uri_bitset
@@ -624,81 +634,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
624634
if (ecma_builtin_global_object_character_is_in (decoded_byte, reserved_uri_bitset)
625635
&& !ecma_builtin_global_object_character_is_in (decoded_byte, unescaped_uri_component_set))
626636
{
627-
output_size += 3;
637+
output_size += URI_ENCODED_BYTE_SIZE;
628638
}
629639
else
630640
{
631641
output_size++;
632642
}
633643
}
634-
else if (decoded_byte < 0xc0 || decoded_byte >= 0xf8)
635-
{
636-
/*
637-
* Invalid UTF-8 starting bytes:
638-
* 10xx xxxx - UTF continuation byte
639-
* 1111 1xxx - maximum length is 4 bytes
640-
*/
641-
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
642-
break;
643-
}
644644
else
645645
{
646-
uint32_t count;
647-
uint32_t min;
648-
uint32_t character;
649-
650-
if (decoded_byte < 0xe0)
651-
{
652-
count = 1;
653-
min = 0x80;
654-
character = decoded_byte & 0x1f;
655-
}
656-
else if (decoded_byte < 0xf0)
657-
{
658-
count = 2;
659-
min = 0x800;
660-
character = decoded_byte & 0x0f;
661-
}
662-
else
663-
{
664-
count = 3;
665-
min = 0x1000;
666-
character = decoded_byte & 0x07;
667-
}
668-
669-
output_size += (count + 1);
670-
671-
do
672-
{
673-
decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
674-
if (decoded_byte == ECMA_BUILTIN_HEX_TO_BYTE_ERROR
675-
|| (decoded_byte & 0xc0) != 0x80)
676-
{
677-
break;
678-
}
679-
680-
character = (character << 6) + (decoded_byte & 0x3f);
681-
input_char_p += 3;
682-
}
683-
while (--count > 0);
684-
685-
if (count != 0
686-
/*
687-
* Explanation of the character < min check: according to
688-
* the UTF standard, each character must be encoded
689-
* with the minimum amount of bytes. We need to reject
690-
* those characters which do not satisfy this condition.
691-
*/
692-
|| character < min
693-
/*
694-
* Not allowed character ranges.
695-
*/
696-
|| character > 0x10ffff
697-
|| (character >= 0xd800 && character <= 0xdfff))
698-
{
699-
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
700-
break;
701-
}
646+
output_size++;
702647
}
703648
}
704649

@@ -723,9 +668,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
723668
}
724669

725670
uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
726-
input_char_p += 3;
671+
input_char_p += URI_ENCODED_BYTE_SIZE;
727672

728-
if (decoded_byte <= 0x7f)
673+
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
729674
{
730675
if (ecma_builtin_global_object_character_is_in (decoded_byte, reserved_uri_bitset)
731676
&& !ecma_builtin_global_object_character_is_in (decoded_byte, unescaped_uri_component_set))
@@ -742,47 +687,40 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
742687
}
743688
else
744689
{
745-
uint32_t count;
746-
uint32_t character;
690+
*output_char_p = (lit_utf8_byte_t) decoded_byte;
691+
output_char_p++;
692+
}
693+
}
747694

748-
/* The validator already checked this before. */
749-
JERRY_ASSERT (decoded_byte >= 0xc0 && decoded_byte < 0xf8);
695+
JERRY_ASSERT (output_start_p + output_size == output_char_p);
750696

751-
if (decoded_byte < 0xe0)
752-
{
753-
count = 1;
754-
character = decoded_byte & 0x1f;
755-
}
756-
else if (decoded_byte < 0xf0)
757-
{
758-
count = 2;
759-
character = decoded_byte & 0x0f;
760-
}
761-
else
762-
{
763-
count = 3;
764-
character = decoded_byte & 0x07;
765-
}
697+
bool valid_utf8 = lit_is_utf8_string_valid (output_start_p, output_size);
766698

767-
do
699+
if (valid_utf8)
700+
{
701+
lit_utf8_iterator_t characters = lit_utf8_iterator_create (output_start_p, output_size);
702+
while (!lit_utf8_iterator_is_eos (&characters))
703+
{
704+
ecma_char_t character = lit_utf8_iterator_read_next (&characters);
705+
706+
/* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */
707+
if (character >= LIT_UTF16_HIGH_SURROGATE_MIN && character <= LIT_UTF16_LOW_SURROGATE_MAX)
768708
{
769-
decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
770-
JERRY_ASSERT (decoded_byte != ECMA_BUILTIN_HEX_TO_BYTE_ERROR
771-
&& (decoded_byte & 0xc0) == 0x80);
772-
character = (character << 6) + (decoded_byte & 0x3f);
773-
input_char_p += 3;
709+
valid_utf8 = false;
710+
break;
774711
}
775-
while (--count > 0);
776-
777-
output_char_p += lit_code_point_to_utf8 (character, output_char_p);
778712
}
779713
}
780714

781-
JERRY_ASSERT (output_start_p + output_size == output_char_p);
782-
783-
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
784-
785-
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
715+
if (valid_utf8)
716+
{
717+
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
718+
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
719+
}
720+
else
721+
{
722+
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
723+
}
786724

787725
MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
788726
}
@@ -864,11 +802,9 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
864802
lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
865803

866804
MEM_DEFINE_LOCAL_ARRAY (input_start_p,
867-
input_size + 1,
805+
input_size,
868806
lit_utf8_byte_t);
869807

870-
input_start_p[input_size] = LIT_BYTE_NULL;
871-
872808
ecma_string_to_utf8_string (input_string_p,
873809
input_start_p,
874810
(ssize_t) (input_size));
@@ -878,49 +814,51 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
878814
* and compute the length of the output, then we encode the input.
879815
*/
880816

881-
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
882-
lit_utf8_size_t output_length = 1;
883-
while (!lit_utf8_iterator_is_eos (&iter))
817+
lit_utf8_byte_t *input_char_p = input_start_p;
818+
lit_utf8_byte_t *input_end_p = input_start_p + input_size;
819+
lit_utf8_size_t output_length = 0;
820+
821+
while (input_char_p < input_end_p)
884822
{
885-
/* Input validation. */
886-
lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
823+
/*
824+
* We expect that the input is a valid UTF-8 sequence,
825+
* so we only need to reject stray surrogates.
826+
*/
887827

888-
if (character <= 0x7f)
828+
/* Input validation. */
829+
if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
889830
{
890-
if (ecma_builtin_global_object_character_is_in (character, unescaped_uri_bitset))
831+
if (ecma_builtin_global_object_character_is_in (*input_char_p, unescaped_uri_bitset))
891832
{
892833
output_length++;
893834
}
894835
else
895836
{
896-
output_length += 3;
837+
output_length += URI_ENCODED_BYTE_SIZE;
897838
}
898839
}
899-
else if (character <= 0x7ff)
840+
else if (*input_char_p == (LIT_UTF8_3_BYTE_MARKER + (LIT_UTF16_HIGH_SURROGATE_MARKER >> 12)))
900841
{
901-
output_length += 6;
902-
}
903-
else if (character <= 0xffff)
904-
{
905-
if (character >= 0xd800 && character <= 0xdfff)
842+
/* The next character is in the [0xd000, 0xdfff] range. */
843+
output_length += URI_ENCODED_BYTE_SIZE;
844+
input_char_p++;
845+
JERRY_ASSERT (input_char_p < input_end_p);
846+
JERRY_ASSERT ((*input_char_p & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
847+
848+
/* If this condition is true, the next character is >= LIT_UTF16_HIGH_SURROGATE_MIN. */
849+
if (*input_char_p & 0x20)
906850
{
907851
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
908852
break;
909853
}
910-
else
911-
{
912-
output_length += 9;
913-
}
914-
}
915-
else if (character <= 0x10ffff)
916-
{
917-
output_length += 12;
854+
output_length += URI_ENCODED_BYTE_SIZE;
918855
}
919856
else
920857
{
921-
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
922-
break;
858+
output_length += URI_ENCODED_BYTE_SIZE;
923859
}
860+
861+
input_char_p++;
924862
}
925863

926864
if (ecma_is_completion_value_empty (ret_value))
@@ -929,58 +867,37 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
929867
output_length,
930868
lit_utf8_byte_t);
931869

932-
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
933870
lit_utf8_byte_t *output_char_p = output_start_p;
934-
while (!lit_utf8_iterator_is_eos (&iter))
871+
input_char_p = input_start_p;
872+
873+
while (input_char_p < input_end_p)
935874
{
936875
/* Input decode. */
937-
lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
938876

939-
if (character <= 0x7f)
877+
if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
940878
{
941-
if (ecma_builtin_global_object_character_is_in (character, unescaped_uri_bitset))
879+
if (ecma_builtin_global_object_character_is_in (*input_char_p, unescaped_uri_bitset))
942880
{
943-
*output_char_p++ = (lit_utf8_byte_t) character;
881+
*output_char_p++ = *input_char_p;
944882
}
945883
else
946884
{
947-
ecma_builtin_global_object_byte_to_hex (output_char_p, character);
948-
output_char_p += 3;
885+
ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p);
886+
output_char_p += URI_ENCODED_BYTE_SIZE;
949887
}
950888
}
951-
else if (character <= 0x7ff)
952-
{
953-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0xc0 | (character >> 6));
954-
output_char_p += 3;
955-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f));
956-
output_char_p += 3;
957-
}
958-
else if (character <= 0xffff)
959-
{
960-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0xe0 | (character >> 12));
961-
output_char_p += 3;
962-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 6) & 0x3f));
963-
output_char_p += 3;
964-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f));
965-
output_char_p += 3;
966-
}
967889
else
968890
{
969-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0xf0 | (character >> 18));
970-
output_char_p += 3;
971-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 12) & 0x3f));
972-
output_char_p += 3;
973-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 6) & 0x3f));
974-
output_char_p += 3;
975-
ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f));
976-
output_char_p += 3;
891+
ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p);
892+
output_char_p += URI_ENCODED_BYTE_SIZE;
977893
}
894+
895+
input_char_p++;
978896
}
979897

980-
*output_char_p = '\0';
981-
JERRY_ASSERT (output_start_p + output_length == output_char_p + 1);
898+
JERRY_ASSERT (output_start_p + output_length == output_char_p);
982899

983-
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length - 1);
900+
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);
984901

985902
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
986903

jerry-core/lit/lit-strings.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
4848
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
4949
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
50+
#define LIT_UTF8_5_BYTE_MARKER (0xF8)
5051
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
5152

5253
#define LIT_UTF8_1_BYTE_MASK (0x80)
@@ -83,6 +84,11 @@
8384
*/
8485
#define LIT_ITERATOR_OFFSET_MASK ((1ull << LIT_ITERATOR_OFFSET_WIDTH) - 1)
8586

87+
/**
88+
* Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings
89+
*/
90+
#define LIT_UTF8_FIRST_BYTE_MAX LIT_UTF8_5_BYTE_MARKER
91+
8692
/**
8793
* Represents position of the iterator
8894
*/

0 commit comments

Comments
 (0)