From 0fd59aed2d55d020d9e7c7b08face1e21d04dc85 Mon Sep 17 00:00:00 2001 From: Oliver <480930+rivo@users.noreply.github.com> Date: Tue, 26 Jul 2022 23:06:04 +0100 Subject: [PATCH] Graphemes class uses StepString() internally now and implements word/sentence/line breaking. --- example_test.go => examples_test.go | 53 +++++++-- grapheme.go | 162 +++++++++++++++------------- grapheme_test.go | 138 ++++++++++++++++++++++++ sentencerules.go | 3 + step_test.go | 32 +++--- wordrules.go | 3 + 6 files changed, 291 insertions(+), 100 deletions(-) rename example_test.go => examples_test.go (84%) diff --git a/example_test.go b/examples_test.go similarity index 84% rename from example_test.go rename to examples_test.go index 0af3bef..79591a4 100644 --- a/example_test.go +++ b/examples_test.go @@ -6,14 +6,6 @@ import ( "github.com/rivo/uniseg" ) -func ExampleGraphemes() { - gr := uniseg.NewGraphemes("πŸ‘πŸΌ!") - for gr.Next() { - fmt.Printf("%x ", gr.Runes()) - } - // Output: [1f44d 1f3fc] [21] -} - func ExampleGraphemeClusterCount() { n := uniseg.GraphemeClusterCount("πŸ‡©πŸ‡ͺπŸ³οΈβ€πŸŒˆ") fmt.Println(n) @@ -270,3 +262,48 @@ func ExampleStepString_lineBreaking() { // Output: First |line. //β€–Second |line.β€– } + +func ExampleGraphemes_graphemes() { + g := uniseg.NewGraphemes("πŸ‡©πŸ‡ͺπŸ³οΈβ€πŸŒˆ") + for g.Next() { + fmt.Println(g.Str()) + } + // Output: πŸ‡©πŸ‡ͺ + //πŸ³οΈβ€πŸŒˆ +} + +func ExampleGraphemes_word() { + g := uniseg.NewGraphemes("Hello, world!") + for g.Next() { + fmt.Print(g.Str()) + if g.IsWordBoundary() { + fmt.Print("|") + } + } + // Output: Hello|,| |world|!| +} + +func ExampleGraphemes_sentence() { + g := uniseg.NewGraphemes("This is sentence 1.0. And this is sentence two.") + for g.Next() { + fmt.Print(g.Str()) + if g.IsSentenceBoundary() { + fmt.Print("|") + } + } + // Output: This is sentence 1.0. |And this is sentence two.| +} + +func ExampleGraphemes_lineBreaking() { + g := uniseg.NewGraphemes("First line.\nSecond line.") + for g.Next() { + fmt.Print(g.Str()) + if g.LineBreak() == uniseg.LineCanBreak { + fmt.Print("|") + } else if g.LineBreak() == uniseg.LineMustBreak { + fmt.Print("β€–") + } + } + // Output: First |line. + //β€–Second |line.β€– +} diff --git a/grapheme.go b/grapheme.go index 3ee7d49..0a8aae3 100644 --- a/grapheme.go +++ b/grapheme.go @@ -2,117 +2,88 @@ package uniseg import "unicode/utf8" -// Graphemes implements an iterator over Unicode extended grapheme clusters, -// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to -// "user-perceived characters". These characters often consist of multiple -// code points (e.g. the "woman kissing woman" emoji consists of 8 code points: -// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + -// woman) and the rules described in Annex #29 must be applied to group those -// code points into clusters perceived by the user as one character. +// Graphemes implements an iterator over Unicode grapheme clusters, or +// user-perceived characters. While iterating, it also provides information +// about word boundaries, sentence boundaries, and line breaks. +// +// After constructing the class via NewGraphemes(str) for a given string "str", +// Next() is called for every grapheme cluster in a loop until it returns false. +// Inside the loop, information about the grapheme cluster as well as boundary +// information is available via the various methods (see examples below). +// +// Using this class to iterate over a string is convenient but it is much slower +// than using this package's Step() or StepString() functions or any of the +// other specialized functions starting with "First". type Graphemes struct { - // The code points over which this class iterates. - codePoints []rune + // The original string. + original string + + // The remaining string to be parsed. + remaining string - // The (byte-based) indices of the code points into the original string plus - // len(original string). Thus, len(indices) = len(codePoints) + 1. - indices []int + // The current grapheme cluster. + cluster string - // The current grapheme cluster to be returned. These are indices into - // codePoints/indices. If start == end, we either haven't started iterating - // yet (0) or the iteration has already completed (1). - start, end int + // The byte offset of the current grapheme cluster relative to the original + // string. + offset int - // The index of the next code point to be parsed. - pos int + // The current boundary information of the Step() parser. + boundaries int - // The current state of the Grapheme code point parser. - graphemeState int + // The current state of the Step() parser. + state int } // NewGraphemes returns a new grapheme cluster iterator. func NewGraphemes(s string) *Graphemes { - l := utf8.RuneCountInString(s) - codePoints := make([]rune, l) - indices := make([]int, l+1) - i := 0 - for pos, r := range s { - codePoints[i] = r - indices[i] = pos - i++ - } - indices[l] = len(s) - g := &Graphemes{ - codePoints: codePoints, - indices: indices, - } - g.Next() // Parse ahead. - return g + return &Graphemes{ + original: s, + remaining: s, + state: -1, + } } // Next advances the iterator by one grapheme cluster and returns false if no // clusters are left. This function must be called before the first cluster is // accessed. func (g *Graphemes) Next() bool { - g.start = g.end - - // The state transition gives us a boundary instruction BEFORE the next code - // point so we always need to stay ahead by one code point. - - // Parse the next code point. - for g.pos <= len(g.codePoints) { - // GB2. - if g.pos == len(g.codePoints) { - g.end = g.pos - g.pos++ - break - } - - // Calculate the next state. - var boundary bool - g.graphemeState, boundary = transitionGraphemeState(g.graphemeState, g.codePoints[g.pos]) - - // If we found a cluster boundary, let's stop here. The current cluster will - // be the one that just ended. - if g.pos == 0 /* GB1 */ || boundary { - g.end = g.pos - g.pos++ - break - } - - g.pos++ + if len(g.remaining) == 0 { + // We're already past the end. + g.state = -2 + g.cluster = "" + return false } - - return g.start != g.end + g.offset += len(g.cluster) + g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state) + return true } // Runes returns a slice of runes (code points) which corresponds to the current // grapheme cluster. If the iterator is already past the end or Next() has not // yet been called, nil is returned. func (g *Graphemes) Runes() []rune { - if g.start == g.end { + if g.state < 0 { return nil } - return g.codePoints[g.start:g.end] + return []rune(g.cluster) } // Str returns a substring of the original string which corresponds to the // current grapheme cluster. If the iterator is already past the end or Next() // has not yet been called, an empty string is returned. func (g *Graphemes) Str() string { - if g.start == g.end { - return "" - } - return string(g.codePoints[g.start:g.end]) + return g.cluster } // Bytes returns a byte slice which corresponds to the current grapheme cluster. // If the iterator is already past the end or Next() has not yet been called, // nil is returned. func (g *Graphemes) Bytes() []byte { - if g.start == g.end { + if g.state < 0 { return nil } - return []byte(string(g.codePoints[g.start:g.end])) + return []byte(g.cluster) } // Positions returns the interval of the current grapheme cluster as byte @@ -122,14 +93,53 @@ func (g *Graphemes) Bytes() []byte { // the original string "str". If Next() has not yet been called, both values are // 0. If the iterator is already past the end, both values are 1. func (g *Graphemes) Positions() (int, int) { - return g.indices[g.start], g.indices[g.end] + if g.state == -1 { + return 0, 0 + } else if g.state == -2 { + return 1, 1 + } + return g.offset, g.offset + len(g.cluster) +} + +// IsWordBoundary returns true if a word ends after the current grapheme +// cluster. +func (g *Graphemes) IsWordBoundary() bool { + if g.state < 0 { + return true + } + return g.boundaries&MaskWord != 0 +} + +// IsSentenceBoundary returns true if a sentence ends after the current +// grapheme cluster. +func (g *Graphemes) IsSentenceBoundary() bool { + if g.state < 0 { + return true + } + return g.boundaries&MaskSentence != 0 +} + +// LineBreak returns whether the line can be broken after the current grapheme +// cluster. A value of LineDontBreak means the line may not be broken, a value +// of LineMustBreak means the line must be broken, and a value of LineCanBreak +// means the line may or may not be broken. +func (g *Graphemes) LineBreak() int { + if g.state == -1 { + return LineDontBreak + } + if g.state == -2 { + return LineMustBreak + } + return g.boundaries & MaskLine } // Reset puts the iterator into its initial state such that the next call to // Next() sets it to the first grapheme cluster again. func (g *Graphemes) Reset() { - g.start, g.end, g.pos, g.graphemeState = 0, 0, 0, grAny - g.Next() // Parse ahead again. + g.state = -1 + g.offset = 0 + g.cluster = "" + g.remaining = g.original } // GraphemeClusterCount returns the number of user-perceived characters diff --git a/grapheme_test.go b/grapheme_test.go index c6ea414..fd5e2d2 100644 --- a/grapheme_test.go +++ b/grapheme_test.go @@ -104,6 +104,137 @@ func TestGraphemesClass(t *testing.T) { } } +// Run the standard Unicode test cases for word boundaries using the Graphemes +// class. +func TestGraphemesClassWord(t *testing.T) { + for testNum, testCase := range wordBreakTestCases { + if testNum == 1700 { + // This test case reveals an inconsistency in the Unicode rule set, + // namely the handling of ZWJ within two RI graphemes. (Grapheme + // rules will restart the RI count, word rules will ignore the ZWJ.) + // An error has been reported. + continue + } + /*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`, + testNum, + strings.TrimSpace(testCase.original), + testCase.expected, + decomposed(testCase.original), + []rune(testCase.original))*/ + gr := NewGraphemes(testCase.original) + var ( + index int + cluster []rune + ) + GraphemeLoop: + for gr.Next() { + if index >= len(testCase.expected) { + t.Errorf(`Test case %d %q failed: More words returned than expected %d`, + testNum, + testCase.original, + len(testCase.expected)) + break + } + cluster = append(cluster, gr.Runes()...) + if gr.IsWordBoundary() { + if len(cluster) != len(testCase.expected[index]) { + t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`, + testNum, + testCase.original, + index, + len(cluster), + cluster, + len(testCase.expected[index]), + testCase.expected[index]) + break + } + for i, r := range cluster { + if r != testCase.expected[index][i] { + t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`, + testNum, + testCase.original, + index, + cluster, + testCase.expected[index]) + break GraphemeLoop + } + } + cluster = nil + index++ + } + } + if index < len(testCase.expected) { + t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`, + testNum, + testCase.original, + index, + len(testCase.expected)) + } + } +} + +// Run the standard Unicode test cases for sentence boundaries using the +// Graphemes class. +func TestGraphemesClassSentence(t *testing.T) { + for testNum, testCase := range sentenceBreakTestCases { + /*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`, + testNum, + strings.TrimSpace(testCase.original), + testCase.expected, + decomposed(testCase.original), + []rune(testCase.original))*/ + gr := NewGraphemes(testCase.original) + var ( + index int + cluster []rune + ) + GraphemeLoop: + for gr.Next() { + if index >= len(testCase.expected) { + t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`, + testNum, + testCase.original, + len(testCase.expected)) + break + } + cluster = append(cluster, gr.Runes()...) + if gr.IsSentenceBoundary() { + if len(cluster) != len(testCase.expected[index]) { + t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`, + testNum, + testCase.original, + index, + len(cluster), + cluster, + len(testCase.expected[index]), + testCase.expected[index]) + break + } + for i, r := range cluster { + if r != testCase.expected[index][i] { + t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`, + testNum, + testCase.original, + index, + cluster, + testCase.expected[index]) + break GraphemeLoop + } + } + cluster = nil + index++ + } + } + if index < len(testCase.expected) { + t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`, + testNum, + testCase.original, + index, + len(testCase.expected)) + } + } +} + // Test the Str() function. func TestGraphemesStr(t *testing.T) { gr := NewGraphemes("möp") @@ -368,3 +499,10 @@ func BenchmarkGraphemesFunctionString(b *testing.B) { } } } + +func TestTest(t *testing.T) { + g := NewGraphemes("͏\u0600") + for g.Next() { + t.Log(g.Runes()) + } +} diff --git a/sentencerules.go b/sentencerules.go index bd87dd2..58c0479 100644 --- a/sentencerules.go +++ b/sentencerules.go @@ -133,6 +133,9 @@ func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newS if state == sbParaSep || state == sbCR { return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4. } + if state < 0 { + return sbAny, true // SB1. + } return state, false } diff --git a/step_test.go b/step_test.go index 8be6270..f4ee3ba 100644 --- a/step_test.go +++ b/step_test.go @@ -95,7 +95,7 @@ func TestStepBytesWord(t *testing.T) { c, b, boundaries, state = Step(b, state) if index >= len(testCase.expected) { - t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`, + t.Errorf(`Test case %d %q failed: More words returned than expected %d`, testNum, testCase.original, len(testCase.expected)) @@ -109,7 +109,7 @@ func TestStepBytesWord(t *testing.T) { cluster := growingCluster growingCluster = nil if len(cluster) != len(testCase.expected[index]) { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`, + t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`, testNum, testCase.original, index, @@ -121,7 +121,7 @@ func TestStepBytesWord(t *testing.T) { } for i, r := range cluster { if r != testCase.expected[index][i] { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`, + t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`, testNum, testCase.original, index, @@ -134,7 +134,7 @@ func TestStepBytesWord(t *testing.T) { index++ } if index < len(testCase.expected) { - t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`, + t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`, testNum, testCase.original, index, @@ -165,7 +165,7 @@ func TestStepBytesSentence(t *testing.T) { c, b, boundaries, state = Step(b, state) if index >= len(testCase.expected) { - t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`, + t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`, testNum, testCase.original, len(testCase.expected)) @@ -179,7 +179,7 @@ func TestStepBytesSentence(t *testing.T) { cluster := growingCluster growingCluster = nil if len(cluster) != len(testCase.expected[index]) { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`, + t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`, testNum, testCase.original, index, @@ -191,7 +191,7 @@ func TestStepBytesSentence(t *testing.T) { } for i, r := range cluster { if r != testCase.expected[index][i] { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`, + t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`, testNum, testCase.original, index, @@ -204,7 +204,7 @@ func TestStepBytesSentence(t *testing.T) { index++ } if index < len(testCase.expected) { - t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`, + t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`, testNum, testCase.original, index, @@ -312,7 +312,7 @@ func TestStepStringWord(t *testing.T) { c, str, boundaries, state = StepString(str, state) if index >= len(testCase.expected) { - t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`, + t.Errorf(`Test case %d %q failed: More words returned than expected %d`, testNum, testCase.original, len(testCase.expected)) @@ -326,7 +326,7 @@ func TestStepStringWord(t *testing.T) { cluster := growingCluster growingCluster = nil if len(cluster) != len(testCase.expected[index]) { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`, + t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`, testNum, testCase.original, index, @@ -338,7 +338,7 @@ func TestStepStringWord(t *testing.T) { } for i, r := range cluster { if r != testCase.expected[index][i] { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`, + t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`, testNum, testCase.original, index, @@ -351,7 +351,7 @@ func TestStepStringWord(t *testing.T) { index++ } if index < len(testCase.expected) { - t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`, + t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`, testNum, testCase.original, index, @@ -382,7 +382,7 @@ func TestStepStringSentence(t *testing.T) { c, str, boundaries, state = StepString(str, state) if index >= len(testCase.expected) { - t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`, + t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`, testNum, testCase.original, len(testCase.expected)) @@ -396,7 +396,7 @@ func TestStepStringSentence(t *testing.T) { cluster := growingCluster growingCluster = nil if len(cluster) != len(testCase.expected[index]) { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`, + t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`, testNum, testCase.original, index, @@ -408,7 +408,7 @@ func TestStepStringSentence(t *testing.T) { } for i, r := range cluster { if r != testCase.expected[index][i] { - t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`, + t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`, testNum, testCase.original, index, @@ -421,7 +421,7 @@ func TestStepStringSentence(t *testing.T) { index++ } if index < len(testCase.expected) { - t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`, + t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`, testNum, testCase.original, index, diff --git a/wordrules.go b/wordrules.go index 541e45f..325407e 100644 --- a/wordrules.go +++ b/wordrules.go @@ -127,6 +127,9 @@ func transitionWordBreakState(state int, r rune, b []byte, str string) (newState if state == wbWSegSpace || state == wbAny|wbZWJBit { return wbAny, false // We don't break but this is also not WB3d or WB3c. } + if state < 0 { + return wbAny, false + } return state, false } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 { // WB3c.