@@ -7,6 +7,8 @@ Author: Daniel Kroening, kroening@kroening.com
77\*******************************************************************/
88
99#include < cstring>
10+ #include < cassert>
11+ #include < vector>
1012
1113#include " unicode.h"
1214
@@ -253,3 +255,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
253255
254256 return argv_narrow;
255257}
258+
259+
260+ /* ******************************************************************\
261+
262+ Function: utf8_to_utf16_little_endian
263+
264+ Inputs: a utf8 string
265+
266+ Outputs: a utf16 u16string
267+
268+ Purpose: converts between utf8 and utf16 strings encoded in little endian
269+
270+ \*******************************************************************/
271+
272+ std::u16string utf8_to_utf16_little_endian (const std::string& utf8)
273+ {
274+ std::vector<unsigned long > unicode;
275+ size_t i=0 ;
276+ while (i<utf8.size ())
277+ {
278+ unsigned long unicode_char;
279+ size_t size;
280+ unsigned char ch=utf8[i++];
281+
282+ if (ch<=0x7F )
283+ {
284+ unicode_char=ch;
285+ size=1 ;
286+ }
287+ else if (ch<=0xDF )
288+ {
289+ unicode_char=ch&0x1F ;
290+ size=2 ;
291+ }
292+ else if (ch<=0xEF )
293+ {
294+ unicode_char=ch&0x0F ;
295+ size=3 ;
296+ }
297+ else if (ch<=0xF7 )
298+ {
299+ unicode_char=ch&0x07 ;
300+ size=4 ;
301+ }
302+ else
303+ assert (false );
304+
305+ for (size_t j=1 ; j<size; ++j)
306+ {
307+ assert (i<utf8.size ());
308+ unsigned char ch=utf8[i++];
309+ assert (ch>=0x80 && ch<=0xBF );
310+ unicode_char<<=6 ;
311+ unicode_char+=ch&0x3F ;
312+ }
313+ assert (unicode_char<0xD800 || unicode_char>0xDFFF );
314+ assert (unicode_char<=0x10FFFF );
315+ unicode.push_back (unicode_char);
316+ }
317+
318+ std::u16string utf16;
319+ for (size_t i=0 ; i<unicode.size (); ++i)
320+ {
321+ unsigned long uchar=unicode[i];
322+ if (uchar<=0xFFFF )
323+ utf16+=(char16_t )uchar;
324+ else
325+ {
326+ // We have to take care of endianness
327+ uchar-=0x10000 ;
328+ utf16+=(char16_t )((uchar&0x3FF )+0xDC00 );
329+ utf16+=(char16_t )((uchar >> 10 )+0xD800 );
330+ }
331+ }
332+ return utf16;
333+ }
0 commit comments