Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add graphemes(s) function to iterate over string graphemes #9261

Merged
merged 1 commit into from
Dec 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).
Expand Down Expand Up @@ -1132,4 +1134,6 @@ Too numerous to mention.
[#9133]: https://github.com/JuliaLang/julia/issues/9133
[#9144]: https://github.com/JuliaLang/julia/issues/9144
[#9249]: https://github.com/JuliaLang/julia/issues/9249
[#9261]: https://github.com/JuliaLang/julia/issues/9261
[#9271]: https://github.com/JuliaLang/julia/issues/9271
[#9294]: https://github.com/JuliaLang/julia/issues/9294
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,7 @@ export
escape_string,
float32_isvalid,
float64_isvalid,
graphemes,
hex,
hex2bytes,
ind2chr,
Expand Down
1 change: 0 additions & 1 deletion base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x
pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data))
pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data))

2 changes: 1 addition & 1 deletion base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
if !is_utf8_start(d[i])
i = nextind(s,i)
end
if j > endof(s)
if j > length(d)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't length the number of code points? I think we want sizeof.

Sorry, it's length(d) not length(s), so this is correct, but confusing.

throw(BoundsError())
end
j = nextind(s,j)-1
Expand Down
62 changes: 58 additions & 4 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

import Base: show, showcompact, ==, string, symbol, isless
import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert

export isgraphemebreak

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char,
export normalize_string, graphemes, is_valid_char, is_assigned_char,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

Expand Down Expand Up @@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11)
const UTF8PROC_LUMP = (1<<12)
const UTF8PROC_STRIPMARK = (1<<13)

############################################################################

let
const p = Array(Ptr{UInt8}, 1)
global utf8proc_map
Expand Down Expand Up @@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol)
throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
end

############################################################################

# returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
function category_code(c)
uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
Expand All @@ -118,8 +124,6 @@ end

is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?

## libc character class predicates ##

islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
Expand Down Expand Up @@ -168,4 +172,54 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
end
end

############################################################################
# iterators for grapheme segmentation

# Ask the utf8proc library whether a grapheme break is permitted between the
# adjacent characters `c1` and `c2` (in that order).
function isgraphemebreak(c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break, Bool, (Char, Char), c1, c2)
end

# Iterator object for walking a string grapheme by grapheme. It only wraps
# the string it was built from.
immutable GraphemeIterator{S<:AbstractString}
    s::S # the wrapped string; SubStrings are generated from it
end

# Build a GraphemeIterator for `s`, parameterized on the concrete type of `s`.
function graphemes(s::AbstractString)
    return GraphemeIterator{typeof(s)}(s)
end

# Declared element type of the iterator: a SubString of the wrapped string type.
function eltype{S}(::GraphemeIterator{S})
    return SubString{S}
end

# Count the graphemes in the wrapped string by counting the positions at which
# a grapheme break is reported between consecutive characters.
function length(g::GraphemeIterator)
    count = 0
    # Seed the "previous character" with a soft hyphen, after which a grapheme
    # break is always allowed, so the first character starts a grapheme.
    prev = Char(0x00ad)
    for cur in g.s
        if isgraphemebreak(prev, cur)
            count += 1
        end
        prev = cur
    end
    return count
end

# The iteration state is simply the iteration state of the wrapped string,
# so starting and termination are delegated to it.
function start(g::GraphemeIterator)
    return start(g.s)
end

function done(g::GraphemeIterator, i)
    return done(g.s, i)
end

# Iteration step. Given the string index `i` at which a grapheme starts,
# return `(grapheme, state)`: `grapheme` is the SubString of the wrapped string
# covering that grapheme, and `state` is the index at which the following
# grapheme (if any) starts.
function next(g::GraphemeIterator, i)
    s = g.s
    j = i                 # index of the last character known to belong to this grapheme
    c0, k = next(s, i)    # c0: previous character; k: index of the next unexamined character
    while !done(s, k)     # extend until a grapheme break is found or the string ends
        c, ℓ = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k
        k = ℓ
        c0 = c
    end
    # Construct a SubString explicitly so the yielded value matches the declared
    # eltype (SubString{S}) instead of whatever type `s[i:j]` produces for the
    # underlying string type.
    return (SubString(s, i, j), k)
end

# Equality, hashing and ordering of grapheme iterators all delegate to the
# wrapped strings, which keeps `hash` consistent with `==`.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end

function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end

function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end

# Converting a grapheme iterator to a string type converts the wrapped string.
function convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator)
    return convert(S, g.s)
end

# Print a one-line summary: the grapheme count, the iterator's type parameter,
# and the wrapped string in double quotes.
function show{S}(io::IO, g::GraphemeIterator{S})
    print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
end

############################################################################

end # module
2 changes: 1 addition & 1 deletion deps/libmojibake
Submodule libmojibake updated from df71da to 86447a
8 changes: 8 additions & 0 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1415,6 +1415,14 @@ Strings

For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.

.. function:: graphemes(s) -> iterator over substrings of s

Returns an iterator over substrings of ``s`` that correspond to
the extended grapheme clusters in the string, as defined by Unicode UAX #29.
(Roughly, these are what users would perceive as single characters,
even though they may contain more than one codepoint; for example,
a letter combined with an accent mark is a single grapheme.)

.. function:: is_valid_ascii(s) -> Bool

Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.
Expand Down
7 changes: 6 additions & 1 deletion test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
@test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"

# make sure substrings handle last code unit even if not start of codepoint
let str = "x\u0302"
    # In UTF-8, "x" takes one code unit and U+0302 takes two, so index 3 is the
    # final code unit but not the start of a codepoint; the range 1:3 must
    # still yield the whole string.
    @test str[1:3] == str
end

# reverseind
for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
Expand All @@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
end
end
end
end
end
28 changes: 27 additions & 1 deletion test/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,35 @@ else
end

# check utf8proc handling of CN category constants

let U = Base.UTF8proc, lower_char = 'β', unassigned_char = '\u038B'
    # a lowercase letter is reported with category code LL
    @test U.category_code(lower_char) == U.UTF8PROC_CATEGORY_LL
    # check codepoint with category code CN
    @test U.category_code(unassigned_char) == U.UTF8PROC_CATEGORY_CN
end

# graphemes
# Each entry of grphtest pairs an input string with the list of graphemes that
# iterating over it is expected to produce (including the empty string and
# strings whose base characters carry combining marks).
let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
"β","l","a","h",
"b\u0302","l","á","h"]),
("", UTF8String[]),
("x\u0302", ["x\u0302"]),
("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
"\U1d4c1\u0300"]),
("x",["x"]),
("abc",["a","b","c"]))
# Repeat the checks for each string conversion function and each of the two
# normalization forms.
for T in (utf8,utf16,utf32)
for nf in (:NFC, :NFD)
for (s, g) in grphtest
# Normalize both the input and the expected graphemes with the same form
# so the comparison is between like-normalized text.
s_ = T(normalize_string(s, nf))
g_ = map(s -> normalize_string(s, nf), g)
grph = collect(graphemes(s_))
@test grph == g_
# length() of the iterator must agree with the number of collected items
@test length(graphemes(s_)) == length(grph)
end
# Ordering check: sorting the strings and then wrapping them must give the
# same result as sorting the iterators themselves. G is built from S before
# S is sorted in place, so the statement order here matters.
# NOTE(review): normalize_string(s) below does not use nf, so this check looks
# identical on every pass of the nf loop — confirm whether that is intended.
S = [T(normalize_string(s)) for (s,g) in grphtest]
G = map(graphemes, S)
@test map(graphemes, sort!(S)) == sort!(G)
end
end
end