From 0fd59aed2d55d020d9e7c7b08face1e21d04dc85 Mon Sep 17 00:00:00 2001
From: Oliver <480930+rivo@users.noreply.github.com>
Date: Tue, 26 Jul 2022 23:06:04 +0100
Subject: [PATCH] Graphemes class uses StepString() internally now and
 implements word/sentence/line breaking.

---
 example_test.go => examples_test.go |  53 +++++++--
 grapheme.go                         | 162 +++++++++++++++-------------
 grapheme_test.go                    | 138 ++++++++++++++++++++++++
 sentencerules.go                    |   3 +
 step_test.go                        |  32 +++---
 wordrules.go                        |   3 +
 6 files changed, 291 insertions(+), 100 deletions(-)
 rename example_test.go => examples_test.go (84%)

diff --git a/example_test.go b/examples_test.go
similarity index 84%
rename from example_test.go
rename to examples_test.go
index 0af3bef..79591a4 100644
--- a/example_test.go
+++ b/examples_test.go
@@ -6,14 +6,6 @@ import (
 	"github.com/rivo/uniseg"
 )
 
-func ExampleGraphemes() {
-	gr := uniseg.NewGraphemes("👍🏼!")
-	for gr.Next() {
-		fmt.Printf("%x ", gr.Runes())
-	}
-	// Output: [1f44d 1f3fc] [21]
-}
-
 func ExampleGraphemeClusterCount() {
 	n := uniseg.GraphemeClusterCount("🇩🇪🏳️‍🌈")
 	fmt.Println(n)
@@ -270,3 +262,48 @@ func ExampleStepString_lineBreaking() {
 	// Output: First |line.
 	//‖Second |line.‖
 }
+
+func ExampleGraphemes_graphemes() {
+	g := uniseg.NewGraphemes("🇩🇪🏳️‍🌈")
+	for g.Next() {
+		fmt.Println(g.Str())
+	}
+	// Output: 🇩🇪
+	//🏳️‍🌈
+}
+
+func ExampleGraphemes_word() {
+	g := uniseg.NewGraphemes("Hello, world!")
+	for g.Next() {
+		fmt.Print(g.Str())
+		if g.IsWordBoundary() {
+			fmt.Print("|")
+		}
+	}
+	// Output: Hello|,| |world|!|
+}
+
+func ExampleGraphemes_sentence() {
+	g := uniseg.NewGraphemes("This is sentence 1.0. And this is sentence two.")
+	for g.Next() {
+		fmt.Print(g.Str())
+		if g.IsSentenceBoundary() {
+			fmt.Print("|")
+		}
+	}
+	// Output: This is sentence 1.0. |And this is sentence two.|
+}
+
+func ExampleGraphemes_lineBreaking() {
+	g := uniseg.NewGraphemes("First line.\nSecond line.")
+	for g.Next() {
+		fmt.Print(g.Str())
+		if g.LineBreak() == uniseg.LineCanBreak {
+			fmt.Print("|")
+		} else if g.LineBreak() == uniseg.LineMustBreak {
+			fmt.Print("‖")
+		}
+	}
+	// Output: First |line.
+	//‖Second |line.‖
+}
diff --git a/grapheme.go b/grapheme.go
index 3ee7d49..0a8aae3 100644
--- a/grapheme.go
+++ b/grapheme.go
@@ -2,117 +2,88 @@ package uniseg
 
 import "unicode/utf8"
 
-// Graphemes implements an iterator over Unicode extended grapheme clusters,
-// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
-// "user-perceived characters". These characters often consist of multiple
-// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
-// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
-// woman) and the rules described in Annex #29 must be applied to group those
-// code points into clusters perceived by the user as one character.
+// Graphemes implements an iterator over Unicode grapheme clusters, or
+// user-perceived characters. While iterating, it also provides information
+// about word boundaries, sentence boundaries, and line breaks.
+//
+// After constructing the class via NewGraphemes(str) for a given string "str",
+// Next() is called for every grapheme cluster in a loop until it returns false.
+// Inside the loop, information about the grapheme cluster as well as boundary
+// information is available via the various methods (see examples below).
+//
+// Using this class to iterate over a string is convenient but it is much slower
+// than using this package's Step() or StepString() functions or any of the
+// other specialized functions starting with "First".
 type Graphemes struct {
-	// The code points over which this class iterates.
-	codePoints []rune
+	// The original string.
+	original string
+
+	// The remaining string to be parsed.
+	remaining string
 
-	// The (byte-based) indices of the code points into the original string plus
-	// len(original string). Thus, len(indices) = len(codePoints) + 1.
-	indices []int
+	// The current grapheme cluster.
+	cluster string
 
-	// The current grapheme cluster to be returned. These are indices into
-	// codePoints/indices. If start == end, we either haven't started iterating
-	// yet (0) or the iteration has already completed (1).
-	start, end int
+	// The byte offset of the current grapheme cluster relative to the original
+	// string.
+	offset int
 
-	// The index of the next code point to be parsed.
-	pos int
+	// The current boundary information of the Step() parser.
+	boundaries int
 
-	// The current state of the Grapheme code point parser.
-	graphemeState int
+	// The state of the Step() parser (-1: not started, -2: past the end).
+	state int
 }
 
 // NewGraphemes returns a new grapheme cluster iterator.
 func NewGraphemes(s string) *Graphemes {
-	l := utf8.RuneCountInString(s)
-	codePoints := make([]rune, l)
-	indices := make([]int, l+1)
-	i := 0
-	for pos, r := range s {
-		codePoints[i] = r
-		indices[i] = pos
-		i++
-	}
-	indices[l] = len(s)
-	g := &Graphemes{
-		codePoints: codePoints,
-		indices:    indices,
-	}
-	g.Next() // Parse ahead.
-	return g
+	return &Graphemes{
+		original:  s,
+		remaining: s,
+		state:     -1,
+	}
 }
 
 // Next advances the iterator by one grapheme cluster and returns false if no
 // clusters are left. This function must be called before the first cluster is
 // accessed.
 func (g *Graphemes) Next() bool {
-	g.start = g.end
-
-	// The state transition gives us a boundary instruction BEFORE the next code
-	// point so we always need to stay ahead by one code point.
-
-	// Parse the next code point.
-	for g.pos <= len(g.codePoints) {
-		// GB2.
-		if g.pos == len(g.codePoints) {
-			g.end = g.pos
-			g.pos++
-			break
-		}
-
-		// Calculate the next state.
-		var boundary bool
-		g.graphemeState, boundary = transitionGraphemeState(g.graphemeState, g.codePoints[g.pos])
-
-		// If we found a cluster boundary, let's stop here. The current cluster will
-		// be the one that just ended.
-		if g.pos == 0 /* GB1 */ || boundary {
-			g.end = g.pos
-			g.pos++
-			break
-		}
-
-		g.pos++
+	if len(g.remaining) == 0 {
+		// We're already past the end.
+		g.state = -2
+		g.cluster = ""
+		return false
 	}
-
-	return g.start != g.end
+	g.offset += len(g.cluster)
+	g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
+	return true
 }
 
 // Runes returns a slice of runes (code points) which corresponds to the current
 // grapheme cluster. If the iterator is already past the end or Next() has not
 // yet been called, nil is returned.
 func (g *Graphemes) Runes() []rune {
-	if g.start == g.end {
+	if g.state < 0 {
 		return nil
 	}
-	return g.codePoints[g.start:g.end]
+	return []rune(g.cluster)
 }
 
 // Str returns a substring of the original string which corresponds to the
 // current grapheme cluster. If the iterator is already past the end or Next()
 // has not yet been called, an empty string is returned.
 func (g *Graphemes) Str() string {
-	if g.start == g.end {
-		return ""
-	}
-	return string(g.codePoints[g.start:g.end])
+	return g.cluster
 }
 
 // Bytes returns a byte slice which corresponds to the current grapheme cluster.
 // If the iterator is already past the end or Next() has not yet been called,
 // nil is returned.
 func (g *Graphemes) Bytes() []byte {
-	if g.start == g.end {
+	if g.state < 0 {
 		return nil
 	}
-	return []byte(string(g.codePoints[g.start:g.end]))
+	return []byte(g.cluster)
 }
 
 // Positions returns the interval of the current grapheme cluster as byte
@@ -122,14 +93,53 @@ func (g *Graphemes) Bytes() []byte {
 // the original string "str". If Next() has not yet been called, both values are
 // 0. If the iterator is already past the end, both values are 1.
 func (g *Graphemes) Positions() (int, int) {
-	return g.indices[g.start], g.indices[g.end]
+	if g.state == -1 {
+		return 0, 0
+	} else if g.state == -2 {
+		return 1, 1
+	}
+	return g.offset, g.offset + len(g.cluster)
+}
+
+// IsWordBoundary returns true if a word ends after the current grapheme
+// cluster. Before the first Next() call and past the end, it returns true.
+func (g *Graphemes) IsWordBoundary() bool {
+	if g.state < 0 {
+		return true
+	}
+	return g.boundaries&MaskWord != 0
+}
+
+// IsSentenceBoundary returns true if a sentence ends after the current grapheme
+// cluster. Before the first Next() call and past the end, it returns true.
+func (g *Graphemes) IsSentenceBoundary() bool {
+	if g.state < 0 {
+		return true
+	}
+	return g.boundaries&MaskSentence != 0
+}
+
+// LineBreak returns whether the line can be broken after the current grapheme
+// cluster: LineDontBreak (may not break), LineMustBreak (must break), or
+// LineCanBreak (may or may not break). Before the first call to Next() it
+// returns LineDontBreak; past the end it returns LineMustBreak.
+func (g *Graphemes) LineBreak() int {
+	if g.state == -1 {
+		return LineDontBreak
+	}
+	if g.state == -2 {
+		return LineMustBreak
+	}
+	return g.boundaries & MaskLine
 }
 
 // Reset puts the iterator into its initial state such that the next call to
 // Next() sets it to the first grapheme cluster again.
 func (g *Graphemes) Reset() {
-	g.start, g.end, g.pos, g.graphemeState = 0, 0, 0, grAny
-	g.Next() // Parse ahead again.
+	g.state = -1
+	g.offset = 0
+	g.cluster = ""
+	g.remaining = g.original
 }
 
 // GraphemeClusterCount returns the number of user-perceived characters
diff --git a/grapheme_test.go b/grapheme_test.go
index c6ea414..fd5e2d2 100644
--- a/grapheme_test.go
+++ b/grapheme_test.go
@@ -104,6 +104,137 @@ func TestGraphemesClass(t *testing.T) {
 	}
 }
 
+// Run the standard Unicode test cases for word boundaries using the Graphemes
+// class.
+func TestGraphemesClassWord(t *testing.T) {
+	for testNum, testCase := range wordBreakTestCases {
+		if testNum == 1700 {
+			// This test case reveals an inconsistency in the Unicode rule set,
+			// namely the handling of ZWJ within two RI graphemes. (Grapheme
+			// rules will restart the RI count, word rules will ignore the ZWJ.)
+			// An error has been reported.
+			continue
+		}
+		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
+		testNum,
+		strings.TrimSpace(testCase.original),
+		testCase.expected,
+		decomposed(testCase.original),
+		[]rune(testCase.original))*/
+		gr := NewGraphemes(testCase.original)
+		var (
+			index   int
+			cluster []rune
+		)
+	GraphemeLoop:
+		for gr.Next() {
+			if index >= len(testCase.expected) {
+				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
+					testNum,
+					testCase.original,
+					len(testCase.expected))
+				break
+			}
+			cluster = append(cluster, gr.Runes()...)
+			if gr.IsWordBoundary() {
+				if len(cluster) != len(testCase.expected[index]) {
+					t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
+						testNum,
+						testCase.original,
+						index,
+						len(cluster),
+						cluster,
+						len(testCase.expected[index]),
+						testCase.expected[index])
+					break
+				}
+				for i, r := range cluster {
+					if r != testCase.expected[index][i] {
+						t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
+							testNum,
+							testCase.original,
+							index,
+							cluster,
+							testCase.expected[index])
+						break GraphemeLoop
+					}
+				}
+				cluster = nil
+				index++
+			}
+		}
+		if index < len(testCase.expected) {
+			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
+				testNum,
+				testCase.original,
+				index,
+				len(testCase.expected))
+		}
+	}
+}
+
+// Run the standard Unicode test cases for sentence boundaries using the
+// Graphemes class.
+func TestGraphemesClassSentence(t *testing.T) {
+	for testNum, testCase := range sentenceBreakTestCases {
+		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
+		testNum,
+		strings.TrimSpace(testCase.original),
+		testCase.expected,
+		decomposed(testCase.original),
+		[]rune(testCase.original))*/
+		gr := NewGraphemes(testCase.original)
+		var (
+			index   int
+			cluster []rune
+		)
+	GraphemeLoop:
+		for gr.Next() {
+			if index >= len(testCase.expected) {
+				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
+					testNum,
+					testCase.original,
+					len(testCase.expected))
+				break
+			}
+			cluster = append(cluster, gr.Runes()...)
+			if gr.IsSentenceBoundary() {
+				if len(cluster) != len(testCase.expected[index]) {
+					t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
+						testNum,
+						testCase.original,
+						index,
+						len(cluster),
+						cluster,
+						len(testCase.expected[index]),
+						testCase.expected[index])
+					break
+				}
+				for i, r := range cluster {
+					if r != testCase.expected[index][i] {
+						t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
+							testNum,
+							testCase.original,
+							index,
+							cluster,
+							testCase.expected[index])
+						break GraphemeLoop
+					}
+				}
+				cluster = nil
+				index++
+			}
+		}
+		if index < len(testCase.expected) {
+			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
+				testNum,
+				testCase.original,
+				index,
+				len(testCase.expected))
+		}
+	}
+}
+
 // Test the Str() function.
 func TestGraphemesStr(t *testing.T) {
 	gr := NewGraphemes("möp")
@@ -368,3 +499,10 @@ func BenchmarkGraphemesFunctionString(b *testing.B) {
 		}
 	}
 }
+
+func TestTest(t *testing.T) {
+	g := NewGraphemes("͏\u0600")
+	for g.Next() {
+		t.Log(g.Runes())
+	}
+}
diff --git a/sentencerules.go b/sentencerules.go
index bd87dd2..58c0479 100644
--- a/sentencerules.go
+++ b/sentencerules.go
@@ -133,6 +133,9 @@ func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newS
 		if state == sbParaSep || state == sbCR {
 			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
 		}
+		if state < 0 {
+			return sbAny, true // SB1.
+		}
 		return state, false
 	}
 
diff --git a/step_test.go b/step_test.go
index 8be6270..f4ee3ba 100644
--- a/step_test.go
+++ b/step_test.go
@@ -95,7 +95,7 @@ func TestStepBytesWord(t *testing.T) {
 			c, b, boundaries, state = Step(b, state)
 
 			if index >= len(testCase.expected) {
-				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
+				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
 					testNum,
 					testCase.original,
 					len(testCase.expected))
@@ -109,7 +109,7 @@ func TestStepBytesWord(t *testing.T) {
 			cluster := growingCluster
 			growingCluster = nil
 			if len(cluster) != len(testCase.expected[index]) {
-				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
+				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
 					testNum,
 					testCase.original,
 					index,
@@ -121,7 +121,7 @@ func TestStepBytesWord(t *testing.T) {
 			}
 			for i, r := range cluster {
 				if r != testCase.expected[index][i] {
-					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
+					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
 						testNum,
 						testCase.original,
 						index,
@@ -134,7 +134,7 @@ func TestStepBytesWord(t *testing.T) {
 			index++
 		}
 		if index < len(testCase.expected) {
-			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
+			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
 				testNum,
 				testCase.original,
 				index,
@@ -165,7 +165,7 @@ func TestStepBytesSentence(t *testing.T) {
 			c, b, boundaries, state = Step(b, state)
 
 			if index >= len(testCase.expected) {
-				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
+				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
 					testNum,
 					testCase.original,
 					len(testCase.expected))
@@ -179,7 +179,7 @@ func TestStepBytesSentence(t *testing.T) {
 			cluster := growingCluster
 			growingCluster = nil
 			if len(cluster) != len(testCase.expected[index]) {
-				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
+				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
 					testNum,
 					testCase.original,
 					index,
@@ -191,7 +191,7 @@ func TestStepBytesSentence(t *testing.T) {
 			}
 			for i, r := range cluster {
 				if r != testCase.expected[index][i] {
-					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
+					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
 						testNum,
 						testCase.original,
 						index,
@@ -204,7 +204,7 @@ func TestStepBytesSentence(t *testing.T) {
 			index++
 		}
 		if index < len(testCase.expected) {
-			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
+			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
 				testNum,
 				testCase.original,
 				index,
@@ -312,7 +312,7 @@ func TestStepStringWord(t *testing.T) {
 			c, str, boundaries, state = StepString(str, state)
 
 			if index >= len(testCase.expected) {
-				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
+				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
 					testNum,
 					testCase.original,
 					len(testCase.expected))
@@ -326,7 +326,7 @@ func TestStepStringWord(t *testing.T) {
 			cluster := growingCluster
 			growingCluster = nil
 			if len(cluster) != len(testCase.expected[index]) {
-				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
+				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
 					testNum,
 					testCase.original,
 					index,
@@ -338,7 +338,7 @@ func TestStepStringWord(t *testing.T) {
 			}
 			for i, r := range cluster {
 				if r != testCase.expected[index][i] {
-					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
+					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
 						testNum,
 						testCase.original,
 						index,
@@ -351,7 +351,7 @@ func TestStepStringWord(t *testing.T) {
 			index++
 		}
 		if index < len(testCase.expected) {
-			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
+			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
 				testNum,
 				testCase.original,
 				index,
@@ -382,7 +382,7 @@ func TestStepStringSentence(t *testing.T) {
 			c, str, boundaries, state = StepString(str, state)
 
 			if index >= len(testCase.expected) {
-				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
+				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
 					testNum,
 					testCase.original,
 					len(testCase.expected))
@@ -396,7 +396,7 @@ func TestStepStringSentence(t *testing.T) {
 			cluster := growingCluster
 			growingCluster = nil
 			if len(cluster) != len(testCase.expected[index]) {
-				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
+				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
 					testNum,
 					testCase.original,
 					index,
@@ -408,7 +408,7 @@ func TestStepStringSentence(t *testing.T) {
 			}
 			for i, r := range cluster {
 				if r != testCase.expected[index][i] {
-					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
+					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
 						testNum,
 						testCase.original,
 						index,
@@ -421,7 +421,7 @@ func TestStepStringSentence(t *testing.T) {
 			index++
 		}
 		if index < len(testCase.expected) {
-			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
+			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
 				testNum,
 				testCase.original,
 				index,
diff --git a/wordrules.go b/wordrules.go
index 541e45f..325407e 100644
--- a/wordrules.go
+++ b/wordrules.go
@@ -127,6 +127,9 @@ func transitionWordBreakState(state int, r rune, b []byte, str string) (newState
 		if state == wbWSegSpace || state == wbAny|wbZWJBit {
 			return wbAny, false // We don't break but this is also not WB3d or WB3c.
 		}
+		if state < 0 {
+			return wbAny, false
+		}
 		return state, false
 	} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
 		// WB3c.