2525#include " ecma-try-catch-macro.h"
2626#include " jrt.h"
2727#include " lit-magic-strings.h"
28+ #include " lit-strings.h"
2829#include " vm.h"
2930#include " jrt-libc-includes.h"
3031
@@ -511,7 +512,12 @@ static uint8_t unescaped_uri_component_set[16] =
511512 0xfe , 0xff , 0xff , 0x87 , 0xfe , 0xff , 0xff , 0x47
512513};
513514
514- #define ECMA_BUILTIN_HEX_TO_BYTE_ERROR 0x100
515+ /*
516+ * Format is a percent sign followed by two hex digits.
517+ */
518+ #define URI_ENCODED_BYTE_SIZE (3 )
519+
520+ #define ECMA_BUILTIN_HEX_TO_BYTE_ERROR (0x100 )
515521
516522/* *
517523 * Helper function to decode a hexadecimal byte from a string.
@@ -598,7 +604,11 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
598604
599605 while (input_char_p < input_end_p)
600606 {
601- /* Input validation. */
607+ /*
608+ * We expect that the input is a valid UTF-8 sequence,
609+ * so characters >= 0x80 can be let through.
610+ */
611+
602612 if (*input_char_p != ' %' )
603613 {
604614 output_size++;
@@ -613,9 +623,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
613623 break ;
614624 }
615625
616- input_char_p += 3 ;
626+ input_char_p += URI_ENCODED_BYTE_SIZE ;
617627
618- if (decoded_byte <= 0x7f )
628+ if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX )
619629 {
620630 /*
621631 * We don't decode those bytes, which are part of reserved_uri_bitset
@@ -624,81 +634,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
624634 if (ecma_builtin_global_object_character_is_in (decoded_byte, reserved_uri_bitset)
625635 && !ecma_builtin_global_object_character_is_in (decoded_byte, unescaped_uri_component_set))
626636 {
627- output_size += 3 ;
637+ output_size += URI_ENCODED_BYTE_SIZE ;
628638 }
629639 else
630640 {
631641 output_size++;
632642 }
633643 }
634- else if (decoded_byte < 0xc0 || decoded_byte >= 0xf8 )
635- {
636- /*
637- * Invalid UTF-8 starting bytes:
638- * 10xx xxxx - UTF continuation byte
639- * 1111 1xxx - maximum length is 4 bytes
640- */
641- ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
642- break ;
643- }
644644 else
645645 {
646- uint32_t count;
647- uint32_t min;
648- uint32_t character;
649-
650- if (decoded_byte < 0xe0 )
651- {
652- count = 1 ;
653- min = 0x80 ;
654- character = decoded_byte & 0x1f ;
655- }
656- else if (decoded_byte < 0xf0 )
657- {
658- count = 2 ;
659- min = 0x800 ;
660- character = decoded_byte & 0x0f ;
661- }
662- else
663- {
664- count = 3 ;
665- min = 0x1000 ;
666- character = decoded_byte & 0x07 ;
667- }
668-
669- output_size += (count + 1 );
670-
671- do
672- {
673- decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
674- if (decoded_byte == ECMA_BUILTIN_HEX_TO_BYTE_ERROR
675- || (decoded_byte & 0xc0 ) != 0x80 )
676- {
677- break ;
678- }
679-
680- character = (character << 6 ) + (decoded_byte & 0x3f );
681- input_char_p += 3 ;
682- }
683- while (--count > 0 );
684-
685- if (count != 0
686- /*
687- * Explanation of the character < min check: according to
688- * the UTF standard, each character must be encoded
689- * with the minimum amount of bytes. We need to reject
690- * those characters, which does not satisfy this condition.
691- */
692- || character < min
693- /*
694- * Not allowed character ranges.
695- */
696- || character > 0x10ffff
697- || (character >= 0xd800 && character <= 0xdfff ))
698- {
699- ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
700- break ;
701- }
646+ output_size++;
702647 }
703648 }
704649
@@ -723,9 +668,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
723668 }
724669
725670 uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
726- input_char_p += 3 ;
671+ input_char_p += URI_ENCODED_BYTE_SIZE ;
727672
728- if (decoded_byte <= 0x7f )
673+ if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX )
729674 {
730675 if (ecma_builtin_global_object_character_is_in (decoded_byte, reserved_uri_bitset)
731676 && !ecma_builtin_global_object_character_is_in (decoded_byte, unescaped_uri_component_set))
@@ -742,47 +687,40 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
742687 }
743688 else
744689 {
745- uint32_t count;
746- uint32_t character;
690+ *output_char_p = (lit_utf8_byte_t ) decoded_byte;
691+ output_char_p++;
692+ }
693+ }
747694
748- /* The validator already checked this before. */
749- JERRY_ASSERT (decoded_byte >= 0xc0 && decoded_byte < 0xf8 );
695+ JERRY_ASSERT (output_start_p + output_size == output_char_p);
750696
751- if (decoded_byte < 0xe0 )
752- {
753- count = 1 ;
754- character = decoded_byte & 0x1f ;
755- }
756- else if (decoded_byte < 0xf0 )
757- {
758- count = 2 ;
759- character = decoded_byte & 0x0f ;
760- }
761- else
762- {
763- count = 3 ;
764- character = decoded_byte & 0x07 ;
765- }
697+ bool valid_utf8 = lit_is_utf8_string_valid (output_start_p, output_size);
766698
767- do
699+ if (valid_utf8)
700+ {
701+ lit_utf8_iterator_t characters = lit_utf8_iterator_create (output_start_p, output_size);
702+ while (!lit_utf8_iterator_is_eos (&characters))
703+ {
704+ ecma_char_t character = lit_utf8_iterator_read_next (&characters);
705+
706+ /* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */
707+ if (character >= LIT_UTF16_HIGH_SURROGATE_MIN && character <= LIT_UTF16_LOW_SURROGATE_MAX)
768708 {
769- decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
770- JERRY_ASSERT (decoded_byte != ECMA_BUILTIN_HEX_TO_BYTE_ERROR
771- && (decoded_byte & 0xc0 ) == 0x80 );
772- character = (character << 6 ) + (decoded_byte & 0x3f );
773- input_char_p += 3 ;
709+ valid_utf8 = false ;
710+ break ;
774711 }
775- while (--count > 0 );
776-
777- output_char_p += lit_code_point_to_utf8 (character, output_char_p);
778712 }
779713 }
780714
781- JERRY_ASSERT (output_start_p + output_size == output_char_p);
782-
783- ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
784-
785- ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
715+ if (valid_utf8)
716+ {
717+ ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
718+ ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
719+ }
720+ else
721+ {
722+ ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
723+ }
786724
787725 MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
788726 }
@@ -864,11 +802,9 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
864802 lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
865803
866804 MEM_DEFINE_LOCAL_ARRAY (input_start_p,
867- input_size + 1 ,
805+ input_size,
868806 lit_utf8_byte_t );
869807
870- input_start_p[input_size] = LIT_BYTE_NULL;
871-
872808 ecma_string_to_utf8_string (input_string_p,
873809 input_start_p,
874810 (ssize_t ) (input_size));
@@ -878,49 +814,51 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
878814 * and compute the length of the output, then we encode the input.
879815 */
880816
881- lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
882- lit_utf8_size_t output_length = 1 ;
883- while (!lit_utf8_iterator_is_eos (&iter))
817+ lit_utf8_byte_t *input_char_p = input_start_p;
818+ lit_utf8_byte_t *input_end_p = input_start_p + input_size;
819+ lit_utf8_size_t output_length = 0 ;
820+
821+ while (input_char_p < input_end_p)
884822 {
885- /* Input validation. */
886- lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
823+ /*
824+ * We expect that the input is a valid UTF-8 sequence,
825+ * so we only need to reject stray (unpaired) surrogates.
826+ */
887827
888- if (character <= 0x7f )
828+ /* Input validation. */
829+ if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
889830 {
890- if (ecma_builtin_global_object_character_is_in (character , unescaped_uri_bitset))
831+ if (ecma_builtin_global_object_character_is_in (*input_char_p , unescaped_uri_bitset))
891832 {
892833 output_length++;
893834 }
894835 else
895836 {
896- output_length += 3 ;
837+ output_length += URI_ENCODED_BYTE_SIZE ;
897838 }
898839 }
899- else if (character <= 0x7ff )
840+ else if (*input_char_p == (LIT_UTF8_3_BYTE_MARKER + (LIT_UTF16_HIGH_SURROGATE_MARKER >> 12 )) )
900841 {
901- output_length += 6 ;
902- }
903- else if (character <= 0xffff )
904- {
905- if (character >= 0xd800 && character <= 0xdfff )
842+ /* The next character is in the [0xd000, 0xdfff] range. */
843+ output_length += URI_ENCODED_BYTE_SIZE;
844+ input_char_p++;
845+ JERRY_ASSERT (input_char_p < input_end_p);
846+ JERRY_ASSERT ((*input_char_p & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
847+
848+ /* If this condition is true, the next character is >= LIT_UTF16_HIGH_SURROGATE_MIN. */
849+ if (*input_char_p & 0x20 )
906850 {
907851 ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
908852 break ;
909853 }
910- else
911- {
912- output_length += 9 ;
913- }
914- }
915- else if (character <= 0x10ffff )
916- {
917- output_length += 12 ;
854+ output_length += URI_ENCODED_BYTE_SIZE;
918855 }
919856 else
920857 {
921- ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
922- break ;
858+ output_length += URI_ENCODED_BYTE_SIZE;
923859 }
860+
861+ input_char_p++;
924862 }
925863
926864 if (ecma_is_completion_value_empty (ret_value))
@@ -929,58 +867,37 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
929867 output_length,
930868 lit_utf8_byte_t );
931869
932- lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
933870 lit_utf8_byte_t *output_char_p = output_start_p;
934- while (!lit_utf8_iterator_is_eos (&iter))
871+ input_char_p = input_start_p;
872+
873+ while (input_char_p < input_end_p)
935874 {
937875 /* Input encode. */
937- lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
938876
939- if (character <= 0x7f )
877+ if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX )
940878 {
941- if (ecma_builtin_global_object_character_is_in (character , unescaped_uri_bitset))
879+ if (ecma_builtin_global_object_character_is_in (*input_char_p , unescaped_uri_bitset))
942880 {
943- *output_char_p++ = ( lit_utf8_byte_t ) character ;
881+ *output_char_p++ = *input_char_p ;
944882 }
945883 else
946884 {
947- ecma_builtin_global_object_byte_to_hex (output_char_p, character );
948- output_char_p += 3 ;
885+ ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p );
886+ output_char_p += URI_ENCODED_BYTE_SIZE ;
949887 }
950888 }
951- else if (character <= 0x7ff )
952- {
953- ecma_builtin_global_object_byte_to_hex (output_char_p, 0xc0 | (character >> 6 ));
954- output_char_p += 3 ;
955- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f ));
956- output_char_p += 3 ;
957- }
958- else if (character <= 0xffff )
959- {
960- ecma_builtin_global_object_byte_to_hex (output_char_p, 0xe0 | (character >> 12 ));
961- output_char_p += 3 ;
962- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 6 ) & 0x3f ));
963- output_char_p += 3 ;
964- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f ));
965- output_char_p += 3 ;
966- }
967889 else
968890 {
969- ecma_builtin_global_object_byte_to_hex (output_char_p, 0xf0 | (character >> 18 ));
970- output_char_p += 3 ;
971- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 12 ) & 0x3f ));
972- output_char_p += 3 ;
973- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | ((character >> 6 ) & 0x3f ));
974- output_char_p += 3 ;
975- ecma_builtin_global_object_byte_to_hex (output_char_p, 0x80 | (character & 0x3f ));
976- output_char_p += 3 ;
891+ ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p);
892+ output_char_p += URI_ENCODED_BYTE_SIZE;
977893 }
894+
895+ input_char_p++;
978896 }
979897
980- *output_char_p = ' \0 ' ;
981- JERRY_ASSERT (output_start_p + output_length == output_char_p + 1 );
898+ JERRY_ASSERT (output_start_p + output_length == output_char_p);
982899
983- ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length - 1 );
900+ ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);
984901
985902 ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
986903
0 commit comments