|
| 1 | +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Yagiz Nizipli <yagiz@nizipli.com> |
| 3 | +Date: Tue, 18 Feb 2025 11:21:51 -0500 |
| 4 | +Subject: add processed_characters option to WriteUtf8V2 |
| 5 | + |
| 6 | +Bug: https://issues.chromium.org/issues/397377176 |
| 7 | + |
| 8 | +Change-Id: I22086a675eb5565bef254a94ac1b6827a1c61a51 |
| 9 | +Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/6276706 |
| 10 | +Reviewed-by: Erik Corry <erikcorry@chromium.org> |
| 11 | +Auto-Submit: Yagiz Nizipli <yagiz@nizipli.com> |
| 12 | +Commit-Queue: Leszek Swirski <leszeks@chromium.org> |
| 13 | +Reviewed-by: Leszek Swirski <leszeks@chromium.org> |
| 14 | +Cr-Commit-Position: refs/heads/main@{#98780} |
| 15 | + |
| 16 | +diff --git a/include/v8-primitive.h b/include/v8-primitive.h |
| 17 | +index 01773bcaff9b921e77ae70ef09d8a30c1637d533..b608535abae0ed2d5ee74a327203a4bffb9847fd 100644 |
| 18 | +--- a/include/v8-primitive.h |
| 19 | ++++ b/include/v8-primitive.h |
| 20 | +@@ -257,11 +257,14 @@ class V8_EXPORT String : public Name { |
| 21 | + * \param buffer The buffer into which the string will be written. |
| 22 | + * \param capacity The number of bytes available in the output buffer. |
| 23 | + * \param flags Various flags that influence the behavior of this operation. |
| 24 | ++ * \param processed_characters_return If non-null, receives the number of |
| 25 | ++ * characters of the string that were processed. |
| 26 | + * \return The number of bytes copied to the buffer including the null |
| 27 | + * terminator (if written). |
| 28 | + */ |
| 29 | + size_t WriteUtf8V2(Isolate* isolate, char* buffer, size_t capacity, |
| 30 | +- int flags = WriteFlags::kNone) const; |
| 31 | ++ int flags = WriteFlags::kNone, |
| 32 | ++ size_t* processed_characters_return = nullptr) const; |
| 33 | + |
| 34 | + /** |
| 35 | + * A zero length string. |
| 36 | +diff --git a/src/api/api.cc b/src/api/api.cc |
| 37 | +index fccbd853f957617c79d97dbdd69fec7c39f65af5..43540a968a5b7e94c3e883d1bf8b5f51072bae05 100644 |
| 38 | +--- a/src/api/api.cc |
| 39 | ++++ b/src/api/api.cc |
| 40 | +@@ -6163,7 +6163,8 @@ void String::WriteOneByteV2(Isolate* v8_isolate, uint32_t offset, |
| 41 | + } |
| 42 | + |
| 43 | + size_t String::WriteUtf8V2(Isolate* v8_isolate, char* buffer, size_t capacity, |
| 44 | +- int flags) const { |
| 45 | ++ int flags, |
| 46 | ++ size_t* processed_characters_return) const { |
| 47 | + auto str = Utils::OpenDirectHandle(this); |
| 48 | + i::Isolate* i_isolate = reinterpret_cast<i::Isolate*>(v8_isolate); |
| 49 | + API_RCS_SCOPE(i_isolate, String, WriteUtf8); |
| 50 | +@@ -6175,7 +6176,8 @@ size_t String::WriteUtf8V2(Isolate* v8_isolate, char* buffer, size_t capacity, |
| 51 | + if (flags & String::WriteFlags::kReplaceInvalidUtf8) { |
| 52 | + i_flags |= i::String::Utf8EncodingFlag::kReplaceInvalid; |
| 53 | + } |
| 54 | +- return i::String::WriteUtf8(i_isolate, str, buffer, capacity, i_flags); |
| 55 | ++ return i::String::WriteUtf8(i_isolate, str, buffer, capacity, i_flags, |
| 56 | ++ processed_characters_return); |
| 57 | + } |
| 58 | + |
| 59 | + namespace { |
| 60 | +diff --git a/src/objects/string.cc b/src/objects/string.cc |
| 61 | +index e6ad2e286bdb05273f0152c214f34f332d67854c..a6013dd168a31e336e79ce5019b36482b6096211 100644 |
| 62 | +--- a/src/objects/string.cc |
| 63 | ++++ b/src/objects/string.cc |
| 64 | +@@ -1111,8 +1111,8 @@ void String::WriteToFlat2(SinkCharT* dst, Tagged<ConsString> src, |
| 65 | + |
| 66 | + // static |
| 67 | + size_t String::WriteUtf8(Isolate* isolate, DirectHandle<String> string, |
| 68 | +- char* buffer, size_t capacity, |
| 69 | +- Utf8EncodingFlags flags) { |
| 70 | ++ char* buffer, size_t capacity, Utf8EncodingFlags flags, |
| 71 | ++ size_t* processed_characters_return) { |
| 72 | + DCHECK_IMPLIES(flags & Utf8EncodingFlag::kNullTerminate, capacity > 0); |
| 73 | + DCHECK_IMPLIES(capacity > 0, buffer != nullptr); |
| 74 | + |
| 75 | +@@ -1121,19 +1121,22 @@ size_t String::WriteUtf8(Isolate* isolate, DirectHandle<String> string, |
| 76 | + DisallowGarbageCollection no_gc; |
| 77 | + FlatContent content = string->GetFlatContent(no_gc); |
| 78 | + DCHECK(content.IsFlat()); |
| 79 | +- if (content.IsOneByte()) { |
| 80 | +- return unibrow::Utf8::Encode<uint8_t>( |
| 81 | +- content.ToOneByteVector(), buffer, capacity, |
| 82 | +- flags & Utf8EncodingFlag::kNullTerminate, |
| 83 | +- flags & Utf8EncodingFlag::kReplaceInvalid) |
| 84 | +- .bytes_written; |
| 85 | +- } else { |
| 86 | +- return unibrow::Utf8::Encode<uint16_t>( |
| 87 | +- content.ToUC16Vector(), buffer, capacity, |
| 88 | +- flags & Utf8EncodingFlag::kNullTerminate, |
| 89 | +- flags & Utf8EncodingFlag::kReplaceInvalid) |
| 90 | +- .bytes_written; |
| 91 | ++ |
| 92 | ++ auto encoding_result = content.IsOneByte() |
| 93 | ++ ? unibrow::Utf8::Encode<uint8_t>( |
| 94 | ++ content.ToOneByteVector(), buffer, capacity, |
| 95 | ++ flags & Utf8EncodingFlag::kNullTerminate, |
| 96 | ++ flags & Utf8EncodingFlag::kReplaceInvalid) |
| 97 | ++ : unibrow::Utf8::Encode<uint16_t>( |
| 98 | ++ content.ToUC16Vector(), buffer, capacity, |
| 99 | ++ flags & Utf8EncodingFlag::kNullTerminate, |
| 100 | ++ flags & Utf8EncodingFlag::kReplaceInvalid); |
| 101 | ++ |
| 102 | ++ if (processed_characters_return != nullptr) { |
| 103 | ++ *processed_characters_return = encoding_result.characters_processed; |
| 104 | + } |
| 105 | ++ |
| 106 | ++ return encoding_result.bytes_written; |
| 107 | + } |
| 108 | + |
| 109 | + template <typename SourceChar> |
| 110 | +diff --git a/src/objects/string.h b/src/objects/string.h |
| 111 | +index d456749e52cbbab17b95334bcc6fee18a597fe58..238310eb3d89a2e9206ff6f58c4fcfacd00bba33 100644 |
| 112 | +--- a/src/objects/string.h |
| 113 | ++++ b/src/objects/string.h |
| 114 | +@@ -553,7 +553,8 @@ V8_OBJECT class String : public Name { |
| 115 | + using Utf8EncodingFlags = base::Flags<Utf8EncodingFlag>; |
| 116 | + static size_t WriteUtf8(Isolate* isolate, DirectHandle<String> string, |
| 117 | + char* buffer, size_t capacity, |
| 118 | +- Utf8EncodingFlags flags); |
| 119 | ++ Utf8EncodingFlags flags, |
| 120 | ++ size_t* processed_characters_return = nullptr); |
| 121 | + |
| 122 | + // Returns true if this string has no unpaired surrogates and false otherwise. |
| 123 | + static inline bool IsWellFormedUnicode(Isolate* isolate, |
| 124 | +diff --git a/test/cctest/test-api.cc b/test/cctest/test-api.cc |
| 125 | +index 3e7f2aa117f64c9ef00bcd9f3492d4e39e5945f0..434a6f56ce6389f9eec463386e19cb61c08e6206 100644 |
| 126 | +--- a/test/cctest/test-api.cc |
| 127 | ++++ b/test/cctest/test-api.cc |
| 128 | +@@ -8607,6 +8607,7 @@ THREADED_TEST(StringWrite) { |
| 129 | + char utf8buf[0xD800 * 3]; |
| 130 | + uint16_t wbuf[100]; |
| 131 | + size_t len; |
| 132 | ++ size_t processed_characters; |
| 133 | + |
| 134 | + memset(utf8buf, 0x1, 1000); |
| 135 | + len = v8::String::Empty(isolate)->WriteUtf8V2( |
| 136 | +@@ -8621,8 +8622,10 @@ THREADED_TEST(StringWrite) { |
| 137 | + CHECK_EQ(0, strcmp(utf8buf, "abc\xC3\xB0\xE2\x98\x83")); |
| 138 | + |
| 139 | + memset(utf8buf, 0x1, 1000); |
| 140 | +- len = str2->WriteUtf8V2(isolate, utf8buf, 8); |
| 141 | ++ len = str2->WriteUtf8V2(isolate, utf8buf, 8, String::WriteFlags::kNone, |
| 142 | ++ &processed_characters); |
| 143 | + CHECK_EQ(8, len); |
| 144 | ++ CHECK_EQ(5, processed_characters); |
| 145 | + CHECK_EQ(0, strncmp(utf8buf, "abc\xC3\xB0\xE2\x98\x83\x01", 9)); |
| 146 | + |
| 147 | + memset(utf8buf, 0x1, 1000); |
| 148 | +@@ -8828,8 +8831,10 @@ THREADED_TEST(StringWrite) { |
| 149 | + |
| 150 | + memset(utf8buf, 0x1, sizeof(utf8buf)); |
| 151 | + utf8buf[5] = 'X'; |
| 152 | +- len = str->WriteUtf8V2(isolate, utf8buf, sizeof(utf8buf)); |
| 153 | ++ len = str->WriteUtf8V2(isolate, utf8buf, sizeof(utf8buf), |
| 154 | ++ String::WriteFlags::kNone, &processed_characters); |
| 155 | + CHECK_EQ(5, len); |
| 156 | ++ CHECK_EQ(5, processed_characters); |
| 157 | + CHECK_EQ('X', utf8buf[5]); // Test that the sixth character is untouched. |
| 158 | + utf8buf[5] = '\0'; |
| 159 | + CHECK_EQ(0, strcmp(utf8buf, "abcde")); |
| 160 | +@@ -8846,6 +8851,29 @@ THREADED_TEST(StringWrite) { |
| 161 | + str->WriteV2(isolate, 0, 0, nullptr); |
| 162 | + len = str->WriteUtf8V2(isolate, nullptr, 0); |
| 163 | + CHECK_EQ(0, len); |
| 164 | ++ |
| 165 | ++ std::tuple<const char*, size_t, size_t> cases[] = { |
| 166 | ++ {"\xC3\xA9", 0, 0}, // é (2-byte) but buffer is 0 |
| 167 | ++ {"\xC3\xA9", 1, 0}, // é (2-byte) but buffer is 1 |
| 168 | ++ {"\xE2\x82\xAC", 0, 0}, // € (3-byte) but buffer is 0 |
| 169 | ++ {"\xE2\x82\xAC", 1, 0}, // € (3-byte) but buffer is 1 |
| 170 | ++ {"\xE2\x82\xAC", 2, 0}, // € (3-byte) but buffer is 2 |
| 171 | ++ {"\xF0\x9F\x98\x81", 0, 0}, // 😁 (4-byte) but buffer is 0 |
| 172 | ++ {"\xF0\x9F\x98\x81", 1, 0}, // 😁 (4-byte) but buffer is 1 |
| 173 | ++ {"\xF0\x9F\x98\x81", 2, 0}, // 😁 (4-byte) but buffer is 2 |
| 174 | ++ }; |
| 175 | ++ |
| 176 | ++ for (const auto& test_case : cases) { |
| 177 | ++ auto test_str = |
| 178 | ++ String::NewFromUtf8(isolate, std::get<0>(test_case)).ToLocalChecked(); |
| 179 | ++ auto test_buffer_capacity = std::get<1>(test_case); |
| 180 | ++ char test_buffer[4]; |
| 181 | ++ len = |
| 182 | ++ test_str->WriteUtf8V2(isolate, test_buffer, test_buffer_capacity, |
| 183 | ++ String::WriteFlags::kNone, &processed_characters); |
| 184 | ++ CHECK_EQ(std::get<2>(test_case), len); |
| 185 | ++ CHECK_EQ(0, processed_characters); |
| 186 | ++ } |
| 187 | + } |
| 188 | + |
| 189 | + static void Utf16Helper(LocalContext& context, const char* name, |
0 commit comments