Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add String#to_utf16 and String.from_utf16 #5541

Merged
merged 1 commit into from
Jan 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions spec/std/string/utf16_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require "spec"

# Specs for `String#to_utf16` and `String.from_utf16`.
#
# Both directions are exercised over the same four code point ranges:
# the two single-unit ranges, the surrogate-pair range, and the
# surrogate range itself (which must map to U+FFFD).
describe "String UTF16" do
  describe "to_utf16" do
    it "in the range U+0000..U+D7FF" do
      encoded = "\u{0}hello\u{d7ff}".to_utf16
      encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
    end

    it "in the range U+E000 to U+FFFF" do
      encoded = "\u{e000}\u{ffff}".to_utf16
      encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
    end

    it "in the range U+10000..U+10FFFF" do
      # Both ends of the range: lowest and highest surrogate pairs.
      encoded = "\u{10000}\u{10FFFF}".to_utf16
      encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
    end

    it "in the range U+D800..U+DFFF" do
      encoded = "\u{D800}\u{DFFF}".to_utf16
      encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
    end
  end

  describe "from_utf16" do
    it "in the range U+0000..U+D7FF" do
      input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      String.from_utf16(input).should eq("\u{0}hello\u{d7ff}")
    end

    it "in the range U+E000 to U+FFFF" do
      input = Slice[0xe000_u16, 0xffff_u16]
      String.from_utf16(input).should eq("\u{e000}\u{ffff}")
    end

    it "in the range U+10000..U+10FFFF" do
      input = Slice[0xd800_u16, 0xdc00_u16]
      String.from_utf16(input).should eq("\u{10000}")

      # Upper bound of the range, mirroring the to_utf16 spec.
      input = Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]
      String.from_utf16(input).should eq("\u{10000}\u{10FFFF}")
    end

    it "in the range U+D800..U+DFFF" do
      # Low surrogate first, then a high surrogate with no partner.
      input = Slice[0xdc00_u16, 0xd800_u16]
      String.from_utf16(input).should eq("\u{fffd}\u{fffd}")

      # A high surrogate as the very last unit has nothing to pair with.
      input = Slice[0xd800_u16]
      String.from_utf16(input).should eq("\u{fffd}")
    end
  end
end
3 changes: 1 addition & 2 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -4272,5 +4272,4 @@ class String
end
end

require "./string/formatter"
require "./string/builder"
require "./string/*"
93 changes: 93 additions & 0 deletions src/string/utf16.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
class String
  # Returns the UTF-16 encoding of this string.
  #
  # Chars below U+10000 are encoded as a single `UInt16`; chars from
  # U+10000 upwards are encoded as a surrogate pair (two `UInt16`s).
  #
  # Invalid chars (in the range U+D800..U+DFFF) are encoded with the
  # unicode replacement char value `0xfffd`.
  #
  # ```
  # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
  # ```
  def to_utf16 : Slice(UInt16)
    # First pass: count the UInt16 units needed so the slice can be
    # allocated with its final size.
    size = 0
    each_char do |char|
      size += char.ord < 0x10000 ? 1 : 2
    end

    slice = Slice(UInt16).new(size)

    # Second pass: write the units.
    i = 0
    each_char do |char|
      ord = char.ord
      # U+D800 is itself a surrogate, so the lower range is exclusive.
      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
        # One UInt16 is enough
        slice[i] = ord.to_u16
      elsif ord >= 0x10000
        # Needs surrogate pair
        ord -= 0x10000
        slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
        i += 1
        slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
      else
        # Invalid char (U+D800..U+DFFF): use replacement
        slice[i] = 0xfffd_u16
      end
      i += 1
    end

    slice
  end

  # Decodes the given *slice* UTF-16 sequence into a String.
  #
  # Invalid values (surrogates that are not part of a well-formed
  # high/low pair) are decoded as the unicode replacement char with
  # codepoint `0xfffd`.
  #
  # ```
  # slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
  # String.from_utf16(slice) # => "hi 𐂥"
  # ```
  def self.from_utf16(slice : Slice(UInt16)) : String
    bytesize = 0
    size = 0

    # First pass: compute the byte size and char count of the result.
    each_utf16_char(slice) do |char|
      bytesize += char.bytesize
      size += 1
    end

    # Second pass: write each decoded char's bytes into the buffer.
    String.new(bytesize) do |buffer|
      each_utf16_char(slice) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      {bytesize, size}
    end
  end

  # Yields each decoded char in the given slice.
  #
  # A high surrogate (0xD800..0xDBFF) immediately followed by a low
  # surrogate (0xDC00..0xDFFF) is combined into one char. Any other
  # surrogate is yielded as the replacement char `0xfffd`.
  private def self.each_utf16_char(slice : Slice(UInt16))
    i = 0
    while i < slice.size
      unit = slice[i].to_i
      if unit < 0xd800 || unit >= 0xe000
        # Single code unit
        codepoint = unit
      elsif 0xd800 <= unit < 0xdc00 &&
            (i + 1) < slice.size &&
            0xdc00 <= slice[i + 1] <= 0xdfff
        # Surrogate pair: consume the low surrogate as well
        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000
        i += 1
      else
        # Unpaired surrogate
        codepoint = 0xfffd
      end

      yield codepoint.chr

      i += 1
    end
  end
end