@@ -18,14 +18,6 @@ Author: Daniel Kroening, kroening@kroening.com
1818#include < windows.h>
1919#endif
2020
21- // / Determine endianness of the architecture
22- // / \return True if the architecture is little_endian
23- bool is_little_endian_arch ()
24- {
25- uint32_t i=1 ;
26- return reinterpret_cast <uint8_t &>(i) != 0 ;
27- }
28-
2921#define BUFSIZE 100
3022
3123std::string narrow (const wchar_t *s)
@@ -140,7 +132,8 @@ static void utf8_append_code(unsigned int c, std::string &result)
140132
141133// / \param s: UTF-32 encoded wide string
142134// / \return utf8-encoded string with the same unicode characters as the input.
143- std::string utf32_to_utf8 (const std::basic_string<unsigned int > &s)
135+ std::string utf32_native_endian_to_utf8 (
136+ const std::basic_string<unsigned int > &s)
144137{
145138 std::string result;
146139
@@ -166,52 +159,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
166159 return argv_narrow;
167160}
168161
169- // / A helper function for dealing with different UTF16 endians
170- // / \par parameters: A 16-bit integer
171- // / \return A 16-bit integer with bytes swapped
172- uint16_t do_swap_bytes (uint16_t x)
173- {
174- const uint16_t b1 = x & 0xFFu ;
175- const uint16_t b2 = x & 0xFF00u ;
176- return static_cast <uint16_t >((b1 << 8 ) | (b2 >> 8 ));
177- }
178-
179-
180- void utf16_append_code (unsigned int code, bool swap_bytes, std::wstring &result)
162+ static void utf16_append_code (unsigned int code, std::wstring &result)
181163{
182164 // we do not treat 0xD800 to 0xDFFF, although
183165 // they are not valid unicode symbols
184166
185167 if (code<0xFFFF )
186- { // code is encoded as one UTF16 character
187- // we just take the code and possibly swap the bytes
188- const unsigned int a =
189- swap_bytes ? do_swap_bytes (static_cast <uint16_t >(code)) : code;
190- result+=static_cast <wchar_t >(a);
168+ {
169+ // code is encoded as one UTF16 character
170+ result += static_cast <wchar_t >(code);
191171 }
192172 else // code is encoded as two UTF16 characters
193173 {
194174 // if this is valid unicode, we have
195175 // code<=0x10FFFF
196176 // but let's not check it programmatically
197177
198- // encode the code in UTF16, possibly swapping bytes.
178+ // encode the code in UTF16
199179 code=code-0x10000 ;
200180 const uint16_t i1 = static_cast <uint16_t >(((code >> 10 ) & 0x3ff ) | 0xD800 );
201- const uint16_t a1 = swap_bytes ? do_swap_bytes (i1) : i1;
202- result+=static_cast <wchar_t >(a1);
181+ result += static_cast <wchar_t >(i1);
203182 const uint16_t i2 = static_cast <uint16_t >((code & 0x3ff ) | 0xDC00 );
204- const uint16_t a2 = swap_bytes ? do_swap_bytes (i2) : i2;
205- result+=static_cast <wchar_t >(a2);
183+ result += static_cast <wchar_t >(i2);
206184 }
207185}
208186
209187
210- // / \par parameters: String in UTF-8 format, bool value indicating whether the
211- // / endianness should be different from the architecture one.
188+ // / Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
189+ // / \par parameters: String in UTF-8 format
212190// / \return String in UTF-16 format. The encoding follows the endianness of the
213191// / architecture.
214- std::wstring utf8_to_utf16 (const std::string& in, bool swap_bytes )
192+ std::wstring utf8_to_utf16_native_endian (const std::string& in)
215193{
216194 std::wstring result;
217195 result.reserve (in.size ());
@@ -264,33 +242,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
264242 code=32 ;
265243 }
266244
267- utf16_append_code (code, swap_bytes, result);
245+ utf16_append_code (code, result);
268246 }
269247
270248 return result;
271249}
272250
273- // / \par parameters: String in UTF-8 format
274- // / \return String in UTF-16BE format
275- std::wstring utf8_to_utf16_big_endian (const std::string &in)
276- {
277- bool swap_bytes=is_little_endian_arch ();
278- return utf8_to_utf16 (in, swap_bytes);
279- }
280-
281- // / \par parameters: String in UTF-8 format
282- // / \return String in UTF-16LE format
283- std::wstring utf8_to_utf16_little_endian (const std::string &in)
284- {
285- bool swap_bytes=!is_little_endian_arch ();
286- return utf8_to_utf16 (in, swap_bytes);
287- }
288-
289- // / \param ch: UTF-16LE character
251+ // / \param ch: UTF-16 character in architecture-native endianness encoding
290252// / \param result: stream to receive string in US-ASCII format, with \\uxxxx
291253// / escapes for other characters
292254// / \param loc: locale to check for printable characters
293- static void utf16_little_endian_to_java (
255+ static void utf16_native_endian_to_java (
294256 const wchar_t ch,
295257 std::ostringstream &result,
296258 const std::locale &loc)
@@ -327,23 +289,23 @@ static void utf16_little_endian_to_java(
327289 }
328290}
329291
330- // / \param ch: UTF-16LE character
292+ // / \param ch: UTF-16 character in architecture-native endianness encoding
331293// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
332- std::string utf16_little_endian_to_java (const wchar_t ch)
294+ std::string utf16_native_endian_to_java (const wchar_t ch)
333295{
334296 std::ostringstream result;
335297 const std::locale loc;
336- utf16_little_endian_to_java (ch, result, loc);
298+ utf16_native_endian_to_java (ch, result, loc);
337299 return result.str ();
338300}
339301
340- // / \param in: String in UTF-16LE format
302+ // / \param in: String in UTF-16 (native endianness) format
341303// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
342- std::string utf16_little_endian_to_java (const std::wstring &in)
304+ std::string utf16_native_endian_to_java (const std::wstring &in)
343305{
344306 std::ostringstream result;
345307 const std::locale loc;
346308 for (const auto ch : in)
347- utf16_little_endian_to_java (ch, result, loc);
309+ utf16_native_endian_to_java (ch, result, loc);
348310 return result.str ();
349311}
0 commit comments