Skip to content

Commit

Permalink
Graphemes class uses StepString() internally now and implements word/…
Browse files Browse the repository at this point in the history
…sentence/line breaking.
  • Loading branch information
rivo committed Jul 26, 2022
1 parent 3081444 commit 0fd59ae
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 100 deletions.
53 changes: 45 additions & 8 deletions example_test.go → examples_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@ import (
"github.com/rivo/uniseg"
)

// ExampleGraphemes walks over the grapheme clusters of a short string and
// prints the code points of every cluster in hexadecimal.
func ExampleGraphemes() {
	for it := uniseg.NewGraphemes("👍🏼!"); it.Next(); {
		codePoints := it.Runes()
		fmt.Printf("%x ", codePoints)
	}
	// Output: [1f44d 1f3fc] [21]
}

func ExampleGraphemeClusterCount() {
n := uniseg.GraphemeClusterCount("🇩🇪🏳️‍🌈")
fmt.Println(n)
Expand Down Expand Up @@ -270,3 +262,48 @@ func ExampleStepString_lineBreaking() {
// Output: First |line.
//‖Second |line.‖
}

// ExampleGraphemes_graphemes prints every grapheme cluster of the input on a
// line of its own.
func ExampleGraphemes_graphemes() {
	for iter := uniseg.NewGraphemes("🇩🇪🏳️‍🌈"); iter.Next(); {
		cluster := iter.Str()
		fmt.Println(cluster)
	}
	// Output: 🇩🇪
	//🏳️‍🌈
}

// ExampleGraphemes_word prints the input cluster by cluster and appends "|"
// after every cluster for which IsWordBoundary() reports true.
func ExampleGraphemes_word() {
	iter := uniseg.NewGraphemes("Hello, world!")
	for iter.Next() {
		piece := iter.Str()
		if iter.IsWordBoundary() {
			piece += "|"
		}
		fmt.Print(piece)
	}
	// Output: Hello|,| |world|!|
}

// ExampleGraphemes_sentence prints the input cluster by cluster and appends "|"
// after every cluster for which IsSentenceBoundary() reports true.
func ExampleGraphemes_sentence() {
	for iter := uniseg.NewGraphemes("This is sentence 1.0. And this is sentence two."); iter.Next(); {
		fmt.Print(iter.Str())
		if !iter.IsSentenceBoundary() {
			continue
		}
		fmt.Print("|")
	}
	// Output: This is sentence 1.0. |And this is sentence two.|
}

// ExampleGraphemes_lineBreaking prints the input cluster by cluster, marking
// optional line breaks with "|" and mandatory line breaks with "‖".
func ExampleGraphemes_lineBreaking() {
	iter := uniseg.NewGraphemes("First line.\nSecond line.")
	for iter.Next() {
		fmt.Print(iter.Str())
		// Dispatch on the break kind reported after the current cluster;
		// LineDontBreak prints no marker.
		switch iter.LineBreak() {
		case uniseg.LineCanBreak:
			fmt.Print("|")
		case uniseg.LineMustBreak:
			fmt.Print("‖")
		}
	}
	// Output: First |line.
	//‖Second |line.‖
}
162 changes: 86 additions & 76 deletions grapheme.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,117 +2,88 @@ package uniseg

import "unicode/utf8"

// Graphemes implements an iterator over Unicode extended grapheme clusters,
// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
// "user-perceived characters". These characters often consist of multiple
// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
// woman) and the rules described in Annex #29 must be applied to group those
// code points into clusters perceived by the user as one character.
// Graphemes implements an iterator over Unicode grapheme clusters, or
// user-perceived characters. While iterating, it also provides information
// about word boundaries, sentence boundaries, and line breaks.
//
// After constructing the class via NewGraphemes(str) for a given string "str",
// Next() is called for every grapheme cluster in a loop until it returns false.
// Inside the loop, information about the grapheme cluster as well as boundary
// information is available via the various methods (see examples below).
//
// Using this class to iterate over a string is convenient but it is much slower
// than using this package's Step() or StepString() functions or any of the
// other specialized functions starting with "First".
//
// NOTE(review): the field list below contains two sets of fields (a
// rune/index-based set and a string-based set); this looks like both sides of
// a diff shown together. Confirm against the committed file.
type Graphemes struct {
// The code points over which this class iterates.
codePoints []rune
// The original string. Reset() copies it back into "remaining" to restart the
// iteration.
original string

// The remaining string to be parsed. Next() reports the end of the iteration
// once it is empty.
remaining string

// The (byte-based) indices of the code points into the original string plus
// len(original string). Thus, len(indices) = len(codePoints) + 1.
indices []int
// The current grapheme cluster. It is the empty string before the first call
// to Next() and after the iterator has moved past the end.
cluster string

// The current grapheme cluster to be returned. These are indices into
// codePoints/indices. If start == end, we either haven't started iterating
// yet (0) or the iteration has already completed (1).
start, end int
// The byte offset of the current grapheme cluster relative to the original
// string.
offset int

// The index of the next code point to be parsed.
pos int
// The current boundary information of the Step() parser. The accessors read
// it through the MaskWord, MaskSentence, and MaskLine bit masks.
boundaries int

// The current state of the Grapheme code point parser.
graphemeState int
// The current state of the Step() parser. Two negative values are used as
// markers: -1 means Next() has not been called since construction or Reset(),
// -2 means the iterator has moved past the end.
state int
}

// NewGraphemes returns a new grapheme cluster iterator.
//
// NOTE(review): two construction paths appear below, one after the other. The
// first builds a rune/index table and calls Next() once; the second only stores
// the string and the "not started" state. As written, everything after
// "return g" is unreachable. This looks like both sides of a diff shown
// together; confirm against the committed file.
func NewGraphemes(s string) *Graphemes {
// Decode s once, recording every code point and its byte offset.
l := utf8.RuneCountInString(s)
codePoints := make([]rune, l)
indices := make([]int, l+1)
i := 0
for pos, r := range s {
codePoints[i] = r
indices[i] = pos
i++
}
// The extra last entry is the byte length of s.
indices[l] = len(s)
g := &Graphemes{
codePoints: codePoints,
indices: indices,
}
g.Next() // Parse ahead.
return g
// Nothing is parsed here. A state of -1 marks "Next() not yet called", which
// is the same state Reset() restores.
return &Graphemes{
original: s,
remaining: s,
state: -1,
}
}

// Next advances the iterator by one grapheme cluster and returns false if no
// clusters are left. This function must be called before the first cluster is
// accessed.
//
// NOTE(review): the body below contains an index-based loop followed by a
// StepString()-based step, and its braces do not balance as shown. This looks
// like both sides of a diff shown together; confirm against the committed file.
func (g *Graphemes) Next() bool {
g.start = g.end

// The state transition gives us a boundary instruction BEFORE the next code
// point so we always need to stay ahead by one code point.

// Parse the next code point.
for g.pos <= len(g.codePoints) {
// GB2.
if g.pos == len(g.codePoints) {
g.end = g.pos
g.pos++
break
}

// Calculate the next state.
var boundary bool
g.graphemeState, boundary = transitionGraphemeState(g.graphemeState, g.codePoints[g.pos])

// If we found a cluster boundary, let's stop here. The current cluster will
// be the one that just ended.
if g.pos == 0 /* GB1 */ || boundary {
g.end = g.pos
g.pos++
break
}

g.pos++
if len(g.remaining) == 0 {
// We're already past the end.
// -2 is the "past the end" marker that Positions() and LineBreak() test for.
// Clearing the cluster makes Str() return "" from here on.
g.state = -2
g.cluster = ""
return false
}

return g.start != g.end
// Move the byte offset past the previous cluster (empty on the first call)
// before that cluster is overwritten.
g.offset += len(g.cluster)
// The results are stored as the new cluster, the still unparsed rest, the
// boundary flags, and the state handed to the next StepString() call.
g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
return true
}

// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or Next() has not
// yet been called, nil is returned.
//
// NOTE(review): two guards and two return statements appear below (an
// index-based pair and a string-based pair). This looks like both sides of a
// diff shown together; confirm against the committed file.
func (g *Graphemes) Runes() []rune {
if g.start == g.end {
// A negative state marks "Next() not yet called" (-1) or "past the end" (-2).
if g.state < 0 {
return nil
}
return g.codePoints[g.start:g.end]
// Converting the cluster string produces a newly allocated rune slice.
return []rune(g.cluster)
}

// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or Next()
// has not yet been called, an empty string is returned.
//
// NOTE(review): an index-based and a string-based return both appear below.
// This looks like both sides of a diff shown together; confirm against the
// committed file.
func (g *Graphemes) Str() string {
if g.start == g.end {
return ""
}
return string(g.codePoints[g.start:g.end])
// No guard is needed here: cluster is "" before the first Next() (zero value,
// or cleared by Reset()) and is cleared again by Next() once the end is reached.
return g.cluster
}

// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or Next() has not yet been called,
// nil is returned.
//
// NOTE(review): two guards and two return statements appear below (an
// index-based pair and a string-based pair). This looks like both sides of a
// diff shown together; confirm against the committed file.
func (g *Graphemes) Bytes() []byte {
if g.start == g.end {
// A negative state marks "Next() not yet called" (-1) or "past the end" (-2).
if g.state < 0 {
return nil
}
return []byte(string(g.codePoints[g.start:g.end]))
// The conversion copies the cluster, so callers receive their own byte slice.
return []byte(g.cluster)
}

// Positions returns the interval of the current grapheme cluster as byte
Expand All @@ -122,14 +93,53 @@ func (g *Graphemes) Bytes() []byte {
// the original string "str". If Next() has not yet been called, both values are
// 0. If the iterator is already past the end, both values are 1.
func (g *Graphemes) Positions() (int, int) {
// NOTE(review): as written, this first return makes the rest of the body
// unreachable. It looks like the removed side of a diff shown next to its
// replacement; confirm against the committed file.
return g.indices[g.start], g.indices[g.end]
// -1: Next() has not been called yet (set by NewGraphemes() and Reset()).
if g.state == -1 {
return 0, 0
// -2: the iterator has moved past the end (set by Next()).
} else if g.state == -2 {
return 1, 1
}
// Byte interval of the current cluster: its start offset and the offset just
// past its last byte.
return g.offset, g.offset + len(g.cluster)
}

// IsWordBoundary reports whether a word ends after the current grapheme
// cluster. It also returns true whenever the iterator state is negative, which
// is the case before the first call to Next() (state -1) and after the iterator
// has moved past the end (state -2).
func (g *Graphemes) IsWordBoundary() bool {
	// Outside of an active iteration there are no boundary flags to consult.
	return g.state < 0 || g.boundaries&MaskWord != 0
}

// IsSentenceBoundary reports whether a sentence ends after the current
// grapheme cluster. It also returns true whenever the iterator state is
// negative, which is the case before the first call to Next() (state -1) and
// after the iterator has moved past the end (state -2).
func (g *Graphemes) IsSentenceBoundary() bool {
	// Outside of an active iteration there are no boundary flags to consult.
	return g.state < 0 || g.boundaries&MaskSentence != 0
}

// LineBreak reports how the line may be broken after the current grapheme
// cluster: LineDontBreak means the line may not be broken, LineMustBreak means
// it must be broken, and LineCanBreak means it may or may not be broken.
//
// Before the first call to Next() (state -1) the result is LineDontBreak; once
// the iterator has moved past the end (state -2) the result is LineMustBreak.
func (g *Graphemes) LineBreak() int {
	switch g.state {
	case -1:
		return LineDontBreak
	case -2:
		return LineMustBreak
	default:
		// Keep only the line-break bits of the boundary flags.
		return g.boundaries & MaskLine
	}
}

// Reset puts the iterator into its initial state such that the next call to
// Next() sets it to the first grapheme cluster again.
//
// NOTE(review): an index-based reset (followed by a Next() call) and a
// string-based reset both appear below. This looks like both sides of a diff
// shown together; confirm against the committed file.
func (g *Graphemes) Reset() {
g.start, g.end, g.pos, g.graphemeState = 0, 0, 0, grAny
g.Next() // Parse ahead again.
// Restore the values a fresh iterator starts with: state -1 ("Next() not yet
// called"), nothing consumed, and the whole original string still to parse.
// boundaries is left as is; the boundary accessors check state before
// reading it.
g.state = -1
g.offset = 0
g.cluster = ""
g.remaining = g.original
}

// GraphemeClusterCount returns the number of user-perceived characters
Expand Down
Loading

0 comments on commit 0fd59ae

Please sign in to comment.