add graphemes(s) function to iterate over graphemes (represented by substrings) of a string s
JuliaLang · Dec 7, 2014 · 47754eb · 47754eb
1 parent b83e4bb
commit 47754eb
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 6 deletions.
diff --git a/base/exports.jl b/base/exports.jl
@@ -821,6 +821,7 @@ export
  escape_string,
  float32_isvalid,
  float64_isvalid,
+ graphemes,
  hex,
  hex2bytes,
  ind2chr,

diff --git a/base/utf8.jl b/base/utf8.jl
@@ -103,7 +103,7 @@ function getindex(s::UTF8String, r::UnitRange{Int})
  if !is_utf8_start(d[i])
  i = nextind(s,i)
  end
- if j > endof(s)
+ if j > length(d)
  throw(BoundsError())
  end
  j = nextind(s,j)-1

diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -4,7 +4,7 @@ module UTF8proc
 import Base: show, showcompact, ==, string, symbol, isless
 
 # also exported by Base:
-export normalize_string, is_valid_char, is_assigned_char,
+export normalize_string, graphemes, is_valid_char, is_assigned_char,
  islower, isupper, isalpha, isdigit, isnumber, isalnum,
  iscntrl, ispunct, isspace, isprint, isgraph, isblank
 
@@ -60,6 +60,8 @@ const UTF8PROC_CHARBOUND = (1<<11)
 const UTF8PROC_LUMP = (1<<12)
 const UTF8PROC_STRIPMARK = (1<<13)
 
+############################################################################
+
 let
  const p = Array(Ptr{UInt8}, 1)
  global utf8proc_map
@@ -110,6 +112,8 @@ function normalize_string(s::AbstractString, nf::Symbol)
  throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
 end
 
+############################################################################
+
 # returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
 function category_code(c)
  uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
@@ -127,9 +131,6 @@ function _catcode(c::Char)
  return unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (Int32,), c))
 end
 
-# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
-
-
 ## libc character class predicates ##
 
 islower(c::Char) = (_catcode(c) == UTF8PROC_CATEGORY_LL)
@@ -178,4 +179,46 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
  end
 end
 
+############################################################################
+# use UTF8PROC_CHARBOUND to iterate over graphemes: GraphemeIterator wraps a string, graphemes(s) builds one
+
+immutable GraphemeIterator{S<:AbstractString}
+ s::S # original string, kept so iteration can hand back pieces of it (SubStrings)
+ data::Vector{UInt8} # bytes from utf8proc_map: each grapheme is preceded by a 0xff marker byte
+ GraphemeIterator(s::S) = new(s, utf8proc_map(s, UTF8PROC_CHARBOUND).data) # inner constructor: map once, keep the raw bytes
+end
+graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s) # public entry point; S is the concrete type of s
+
+import Base: length, eltype, start, next, done, show, is_utf8_start
+
+eltype{S}(::GraphemeIterator{S}) = SubString{S} # declared element type: a SubString over the wrapped string type S
+
+# Number of graphemes in the iterator.
+# g.data holds one 0xff marker byte in front of every grapheme, so counting the
+# markers counts the graphemes without decoding anything.
+function length(g::GraphemeIterator)
+    nmarkers = 0
+    for byte in g.data
+        if byte == 0xff
+            nmarkers += 1
+        end
+    end
+    return nmarkers
+end
+
+start(g::GraphemeIterator) = (1,1) # state is a tuple: (index into g.data, index into g.s)
+done(g::GraphemeIterator, i) = i[1] > length(g.data) # finished once the data index runs past the last byte
+
+# Advance the iterator by one grapheme.
+# `i` is the state tuple (di, si): di is the index in g.data of the 0xff marker that
+# opens the current grapheme, si is the index in g.s of the grapheme's first character.
+# Returns (grapheme, next_state), where the grapheme is a SubString of g.s.
+function next(g::GraphemeIterator, i)
+    di, si = i # indices in data and s, respectively
+    di += 2 # step past the 0xff marker and the lead byte of the first character
+    sj = nextind(g.s, si) # index in g.s just past the characters consumed so far
+    while di <= length(g.data) && g.data[di] != 0xff
+        # every further lead byte in data is one more character of this grapheme
+        if is_utf8_start(g.data[di])
+            sj = nextind(g.s, sj)
+        end
+        di += 1
+    end
+    # Construct the SubString explicitly rather than indexing with g.s[si:sj-1], so
+    # the element really is a SubString{S} as eltype and the documentation promise,
+    # independent of what range indexing returns for the string type S.
+    return SubString(g.s, si, sj-1), (di, sj)
+end
+
+# One-line description: grapheme count, wrapped string type, and the string itself.
+function show{S}(io::IO, g::GraphemeIterator{S})
+    print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
+end
+
+############################################################################
+
 end # module
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -1399,6 +1399,14 @@ Strings
 
  For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
 
+.. function:: graphemes(s) -> iterator over substrings of s
+
+ Returns an iterator over substrings of ``s`` that correspond to
+ the graphemes in the string, as defined by Unicode UAX #29.
+ (Roughly, these are what users would perceive as single characters,
+ even though they may contain more than one codepoint; for example
+ a letter combined with an accent mark.)
+
 .. function:: is_valid_ascii(s) -> Bool
 
  Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise.

diff --git a/test/strings.jl b/test/strings.jl
@@ -1266,3 +1266,8 @@ Base.start(jt::i9178) = (jt.nnext=0 ; jt.ndone=0 ; 0)
 Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
 Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
 @test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"
+
+# make sure substrings handle last code unit even if not start of codepoint
+let s = "x\u0302"
+ @test s[1:3] == s # index 3 is the last code unit of s, inside the multi-unit U+0302 rather than at its start
+end
diff --git a/test/unicode.jl b/test/unicode.jl
@@ -93,9 +93,28 @@ else
 end
 
 # check utf8proc handling of CN category constants
-
 let c_ll = 'β', c_cn = '\u038B' # c_ll should report category LL, c_cn category CN
  @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
  # check codepoint with category code CN
  @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
 end
+
+# graphemes
+for T in (utf8,utf16,utf32) # T is applied to the input string below, so each conversion is exercised
+ for nf in (:NFC, :NFD) # repeat for both normalization forms
+ for (s, g) in (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
+ "β","l","a","h",
+ "b\u0302","l","á","h"]),
+ ("", UTF8String[]), # empty string: no graphemes expected
+ ("x\u0302", ["x\u0302"]),
+ ("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
+ ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
+ "\U1d4c1\u0300"]),
+ ("x",["x"]),
+ ("abc",["a","b","c"]))
+ s_ = T(normalize_string(s, nf)) # normalize the input, then convert it with T
+ g_ = map(s -> normalize_string(s, nf), g) # normalize each expected grapheme the same way (the lambda's s shadows the outer s)
+ @test collect(graphemes(s_)) == g_ # pairs are (input string s, expected graphemes g)
+ end
+ end
+end