diff --git a/tok/tok.go b/tok/tok.go
index 771caa1284b..d2ea8c072db 100644
--- a/tok/tok.go
+++ b/tok/tok.go
@@ -19,6 +19,7 @@ package tok
 import (
 	"encoding/binary"
 	"plugin"
+	"strings"
 	"time"
 
 	"github.com/golang/glog"
@@ -275,7 +276,9 @@ func (t HourTokenizer) IsSortable() bool { return true }
 func (t HourTokenizer) IsLossy() bool { return true }
 
 // TermTokenizer generates term tokens from string data.
-type TermTokenizer struct{}
+type TermTokenizer struct {
+	lang string
+}
 
 func (t TermTokenizer) Name() string { return "term" }
 func (t TermTokenizer) Type() string { return "string" }
@@ -284,8 +287,18 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
 	if !ok || str == "" {
 		return []string{str}, nil
 	}
-	tokens := termAnalyzer.Analyze([]byte(str))
-	return uniqueTerms(tokens), nil
+	lang := LangBase(t.lang)
+	switch lang {
+	case "zh", "ja", "th", "lo", "my", "bo", "km", "kxm":
+		// Chinese, Japanese, Thai, Lao, Burmese, Tibetan and Khmer (km, kxm)
+		// do not use spaces to delimit words, so standard term splitting does
+		// not apply. As a stopgap, split on spaces and drop duplicates.
+		tokens := strings.Split(str, " ")
+		return x.RemoveDuplicates(tokens), nil
+	default:
+		tokens := termAnalyzer.Analyze([]byte(str))
+		return uniqueTerms(tokens), nil
+	}
 }
 func (t TermTokenizer) Identifier() byte { return IdentTerm }
 func (t TermTokenizer) IsSortable() bool { return false }
diff --git a/tok/tok_test.go b/tok/tok_test.go
index c1b427ae36f..31f77022f5e 100644
--- a/tok/tok_test.go
+++ b/tok/tok_test.go
@@ -152,6 +152,18 @@ func TestTermTokenizer(t *testing.T) {
 	require.Equal(t, 2, len(tokens))
 	id := tokenizer.Identifier()
 	require.Equal(t, []string{encodeToken("tokenizer", id), encodeToken("works", id)}, tokens)
+
+	// Temporarily commented out, as this is the ideal behaviour; we are not there yet.
+	/*
+		tokens, err = BuildTokens("Barack Obama made Obamacare", tokenizer)
+		require.NoError(t, err)
+		require.Equal(t, 3, len(tokens))
+		require.Equal(t, []string{
+			encodeToken("barack obama", id),
+			encodeToken("made", id),
+			encodeToken("obamacare", id),
+		}, tokens)
+	*/
 }
 
 func TestTrigramTokenizer(t *testing.T) {
@@ -285,6 +297,23 @@ func TestFullTextTokenizerCJKJapanese(t *testing.T) {
 	checkSortedAndUnique(t, got)
 }
 
+func TestTermTokenizeCJKChinese(t *testing.T) {
+	tokenizer, ok := GetTokenizer("term")
+	require.True(t, ok)
+	require.NotNil(t, tokenizer)
+
+	got, err := BuildTokens("第一轮 第二轮 第一轮", GetTokenizerForLang(tokenizer, "zh"))
+	require.NoError(t, err)
+
+	id := tokenizer.Identifier()
+	wantToks := []string{
+		encodeToken("第一轮", id),
+		encodeToken("第二轮", id),
+	}
+	require.Equal(t, wantToks, got)
+	checkSortedAndUnique(t, got)
+}
+
 func checkSortedAndUnique(t *testing.T, tokens []string) {
 	if !sort.StringsAreSorted(tokens) {
 		t.Error("tokens were not sorted")
@@ -299,3 +328,7 @@ func checkSortedAndUnique(t *testing.T, tokens []string) {
 		set[tok] = struct{}{}
 	}
 }
+
+func BenchmarkTermTokenizer(b *testing.B) {
+	b.Skip() // TODO: implement a benchmark for the term tokenizer.
+}
diff --git a/tok/tokens.go b/tok/tokens.go
index 90cf330c517..9cf8a7e9bd8 100644
--- a/tok/tokens.go
+++ b/tok/tokens.go
@@ -36,6 +36,8 @@ func GetTokenizerForLang(t Tokenizer, lang string) Tokenizer {
 		// We must return a new instance because another goroutine might be calling this
 		// with a different lang.
 		return FullTextTokenizer{lang: lang}
+	case TermTokenizer:
+		return TermTokenizer{lang: lang}
 	case ExactTokenizer:
 		langTag, err := language.Parse(lang)
 		// We default to english if the language is not supported.
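
For reviewers, a minimal usage sketch of the behaviour this patch introduces. It is illustrative only and not part of the patch; it assumes the exported helpers exercised by the tests above (GetTokenizer, GetTokenizerForLang, BuildTokens) and the usual github.com/dgraph-io/dgraph/tok import path.

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/tok"
)

func main() {
	// Look up the registered term tokenizer.
	term, ok := tok.GetTokenizer("term")
	if !ok {
		panic("term tokenizer not registered")
	}

	// GetTokenizerForLang now returns a TermTokenizer carrying the lang,
	// so "zh" input is split on spaces and de-duplicated rather than run
	// through the bleve term analyzer.
	zh := tok.GetTokenizerForLang(term, "zh")
	tokens, err := tok.BuildTokens("第一轮 第二轮 第一轮", zh)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(tokens)) // 2: the repeated 第一轮 collapses to one token
}

Any language outside the zh/ja/th/lo/my/bo/km/kxm list still takes the default path through termAnalyzer, so existing behaviour is unchanged.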