Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of length on UTF8String and UTF16String #11107

Merged
merged 6 commits into from
May 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion base/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,24 @@ utf16_is_trail(c::UInt16) = (c & 0xfc00) == 0xdc00
utf16_is_surrogate(c::UInt16) = (c & 0xf800) == 0xd800
utf16_get_supplementary(lead::UInt16, trail::UInt16) = Char(UInt32(lead-0xd7f7)<<10 + trail)

# Number of characters in `s`: the count of code units, excluding the final
# element of `s.data`, that are not trail surrogates.
function length(s::UTF16String)
    units = s.data
    # The last element of the backing array is left out of the scan.
    nunits = length(units) - 1
    if nunits == 0
        return 0
    end
    nchars = 0
    @inbounds for k = 1:nunits
        # A trail surrogate does not start a new character, so skip it.
        utf16_is_trail(units[k]) || (nchars += 1)
    end
    return nchars
end

# Index of the last character of `s`, or 0 when the string holds no code units
# besides the final element of `s.data`.
function endof(s::UTF16String)
    units = s.data
    n = length(units) - 1   # the final element of the array is not a candidate
    if n == 0
        return n
    end
    # When the final code unit is a surrogate, step back one unit
    # (presumably onto the lead unit of the pair).
    if utf16_is_surrogate(units[n])
        return n - 1
    else
        return n
    end
end

function next(s::UTF16String, i::Int)
if !utf16_is_surrogate(s.data[i])
return Char(s.data[i]), i+1
Expand All @@ -32,6 +44,7 @@ function reverseind(s::UTF16String, i::Integer)
j = length(s.data) - i
return Base.utf16_is_trail(s.data[j]) ? j-1 : j
end

# Index of the last code unit of `s`; s.data includes a NULL terminator,
# which is not counted.
function lastidx(s::UTF16String)
    return length(s.data) - 1
end

function reverse(s::UTF16String)
Expand All @@ -48,7 +61,7 @@ function reverse(s::UTF16String)
return UTF16String(out)
end

# TODO: optmize this
# TODO: optimize this
function encode16(s::AbstractString)
buf = UInt16[]
for ch in s
Expand Down
13 changes: 11 additions & 2 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,17 @@ function endof(s::UTF8String)
end
i
end
# Number of characters in `s`, obtained from the C routine `u8_charnum`,
# which is given the byte data and its length in bytes.
function length(s::UTF8String)
    bytes = s.data
    nchars = ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
                   bytes, length(bytes))
    # The C side reports a Csize_t; callers expect an Int.
    return Int(nchars)
end

# Report whether `byte` has the bit pattern 10xxxxxx, i.e. its top two bits
# are binary 10 (a UTF-8 continuation byte).
function is_utf8_continuation(byte::UInt8)
    return (byte >> 6) == 0x02
end

# Number of characters in `s`: the count of bytes in `s.data` that are not
# UTF-8 continuation bytes.
function length(s::UTF8String)
    bytes = s.data
    nchars = 0
    @inbounds for k = 1:length(bytes)
        # Continuation bytes do not start a new character, so skip them.
        is_utf8_continuation(bytes[k]) || (nchars += 1)
    end
    return nchars
end

function next(s::UTF8String, i::Int)
# potentially faster version
Expand Down
18 changes: 0 additions & 18 deletions src/support/utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -247,24 +247,6 @@ size_t u8_charnum(const char *s, size_t offset)
return charnum;
}

/* number of characters in NUL-terminated string
 *
 * Counts every byte that is not a UTF-8 continuation byte (bit pattern
 * 10xxxxxx). For well-formed UTF-8 this is the number of encoded characters.
 *
 * The scan stops at the first NUL and never looks beyond it, so a multibyte
 * sequence truncated at the end of the string cannot cause an out-of-bounds
 * read. Bytes are inspected as unsigned values, so the result does not depend
 * on whether plain `char` is signed on the target platform.
 */
size_t u8_strlen(const char *s)
{
    size_t count = 0;

    for (; *s != '\0'; s++) {
        /* (byte & 0xC0) != 0x80  <=>  the byte is not a continuation byte */
        if (((unsigned char)*s & 0xC0) != 0x80)
            count++;
    }
    return count;
}

size_t u8_strwidth(const char *s)
{
uint32_t ch;
Expand Down
5 changes: 1 addition & 4 deletions src/support/utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ size_t u8_wc_toutf8(char *dest, uint32_t ch);
DLLEXPORT size_t u8_offset(const char *str, size_t charnum);

/* byte offset to character number */
DLLEXPORT size_t u8_charnum(const char *s, size_t offset);
DLLEXPORT size_t u8_charnum(const char *str, size_t offset);

/* return next character, updating an index variable */
uint32_t u8_nextchar(const char *s, size_t *i);
Expand Down Expand Up @@ -93,9 +93,6 @@ char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn);

char *u8_memrchr(const char *s, uint32_t ch, size_t sz);

/* count the number of characters in a NUL-terminated UTF-8 string */
DLLEXPORT size_t u8_strlen(const char *s);

/* number of columns occupied by a string */
DLLEXPORT size_t u8_strwidth(const char *s);

Expand Down