feat(querylang): language support for term tokenization (#6269)
* Added support for language-aware term tokenization

* Cleaned up an antipattern; added the ideal behaviour in tests, but left it commented out
chewxy authored Sep 7, 2020
1 parent 9109186 commit 20a067b
Showing 3 changed files with 51 additions and 3 deletions.
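
In short: when the term tokenizer is requested for a language whose script is not space-delimited in the usual sense (Chinese, Japanese, Thai, and the others listed in the diff below), term extraction now falls back to a plain split on spaces plus de-duplication, instead of the default term analyzer. A minimal sketch of the new entry point, mirroring the test added below (an Example-style function that would live in package tok; assumes a fmt import):

	// Sketch only: mirrors TestTermTokenizeCJKChinese below.
	func ExampleTermTokenizerChinese() {
		tokenizer, ok := GetTokenizer("term")
		if !ok {
			panic("term tokenizer not registered")
		}

		// GetTokenizerForLang returns a TermTokenizer carrying lang "zh",
		// so Tokens splits on spaces and de-duplicates rather than running
		// the default term analyzer.
		tokens, err := BuildTokens("第一轮 第二轮 第一轮", GetTokenizerForLang(tokenizer, "zh"))
		if err != nil {
			panic(err)
		}
		fmt.Println(len(tokens))
		// Output: 2
	}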
tok/tok.go (18 changes: 15 additions & 3 deletions)
@@ -19,6 +19,7 @@ package tok
 import (
 	"encoding/binary"
 	"plugin"
+	"strings"
 	"time"
 
 	"github.com/golang/glog"
@@ -275,7 +276,9 @@ func (t HourTokenizer) IsSortable() bool { return true }
 func (t HourTokenizer) IsLossy() bool { return true }
 
 // TermTokenizer generates term tokens from string data.
-type TermTokenizer struct{}
+type TermTokenizer struct {
+	lang string
+}
 
 func (t TermTokenizer) Name() string { return "term" }
 func (t TermTokenizer) Type() string { return "string" }
@@ -284,8 +287,17 @@ func (t TermTokenizer) Tokens(v interface{}) ([]string, error) {
 	if !ok || str == "" {
 		return []string{str}, nil
 	}
-	tokens := termAnalyzer.Analyze([]byte(str))
-	return uniqueTerms(tokens), nil
+	lang := LangBase(t.lang)
+	switch lang {
+	case "zh", "ja", "th", "lo", "my", "bo", "km", "kxm":
+		// Chinese, Japanese, Thai, Lao, Burmese, Tibetan and Khmer (km, kxm) do not use spaces as delimiters. We simply split by space.
+		tokens := strings.Split(str, " ")
+		return x.RemoveDuplicates(tokens), nil
+	default:
+		tokens := termAnalyzer.Analyze([]byte(str))
+		return uniqueTerms(tokens), nil
+	}
+
 }
 func (t TermTokenizer) Identifier() byte { return IdentTerm }
 func (t TermTokenizer) IsSortable() bool { return false }
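The switch above is the heart of the change. For readers outside the codebase, a self-contained sketch of the same logic (LangBase, x.RemoveDuplicates, uniqueTerms, and termAnalyzer are Dgraph internals; dedupe below is a stand-in for the de-duplication helpers, and the default branch substitutes a naive lowercase/whitespace split for the package's real term analyzer):

	package main

	import (
		"fmt"
		"strings"
	)

	// dedupe stands in for x.RemoveDuplicates / uniqueTerms: keep the first
	// occurrence of each term, drop repeats.
	func dedupe(in []string) []string {
		seen := make(map[string]struct{}, len(in))
		out := make([]string, 0, len(in))
		for _, s := range in {
			if _, ok := seen[s]; !ok {
				seen[s] = struct{}{}
				out = append(out, s)
			}
		}
		return out
	}

	// terms mimics the switch in TermTokenizer.Tokens.
	func terms(str, lang string) []string {
		switch lang {
		case "zh", "ja", "th", "lo", "my", "bo", "km", "kxm":
			// These scripts do not delimit words with spaces, so the commit
			// simply splits on whatever spaces are present.
			return dedupe(strings.Split(str, " "))
		default:
			// Naive stand-in for termAnalyzer.Analyze.
			return dedupe(strings.Fields(strings.ToLower(str)))
		}
	}

	func main() {
		fmt.Println(terms("第一轮 第二轮 第一轮", "zh"))  // [第一轮 第二轮]
		fmt.Println(terms("Tokenizer works WORKS", "en")) // [tokenizer works]
	}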
tok/tok_test.go (34 changes: 34 additions & 0 deletions)
@@ -152,6 +152,18 @@ func TestTermTokenizer(t *testing.T) {
 	require.Equal(t, 2, len(tokens))
 	id := tokenizer.Identifier()
 	require.Equal(t, []string{encodeToken("tokenizer", id), encodeToken("works", id)}, tokens)
+
+	// TEMPORARILY COMMENTED OUT AS THIS IS THE IDEAL BEHAVIOUR. WE ARE NOT THERE YET.
+	/*
+		tokens, err = BuildTokens("Barack Obama made Obamacare", tokenizer)
+		require.NoError(t, err)
+		require.Equal(t, 3, len(tokens))
+		require.Equal(t, []string{
+			encodeToken("barack obama", id),
+			encodeToken("made", id),
+			encodeToken("obamacare", id),
+		})
+	*/
 }
 
 func TestTrigramTokenizer(t *testing.T) {
@@ -285,6 +297,24 @@ func TestFullTextTokenizerCJKJapanese(t *testing.T) {
 	checkSortedAndUnique(t, got)
 }
 
+func TestTermTokenizeCJKChinese(t *testing.T) {
+	tokenizer, ok := GetTokenizer("term")
+	require.True(t, ok)
+	require.NotNil(t, tokenizer)
+
+	got, err := BuildTokens("第一轮 第二轮 第一轮", GetTokenizerForLang(tokenizer, "zh"))
+	require.NoError(t, err)
+
+	id := tokenizer.Identifier()
+	wantToks := []string{
+		encodeToken("第一轮", id),
+		encodeToken("第二轮", id),
+	}
+	require.Equal(t, wantToks, got)
+	checkSortedAndUnique(t, got)
+
+}
+
 func checkSortedAndUnique(t *testing.T, tokens []string) {
 	if !sort.StringsAreSorted(tokens) {
 		t.Error("tokens were not sorted")
@@ -299,3 +329,7 @@ func checkSortedAndUnique(t *testing.T, tokens []string) {
 		set[tok] = struct{}{}
 	}
 }
+
+func BenchmarkTermTokenizer(b *testing.B) {
+	b.Skip() // tmp
+}
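
A note on encodeToken, used throughout these tests: each token is prefixed with the tokenizer's identifier byte so that tokens from different index types never collide. The helper is defined elsewhere in the test file; a plausible shape, not part of this commit:

	// Plausible shape of the existing test helper: prefix the token with
	// the tokenizer's identifier byte.
	func encodeToken(tok string, typ byte) string {
		return string(typ) + tok
	}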
tok/tokens.go (2 changes: 2 additions & 0 deletions)
@@ -36,6 +36,8 @@ func GetTokenizerForLang(t Tokenizer, lang string) Tokenizer {
 		// We must return a new instance because another goroutine might be calling this
 		// with a different lang.
 		return FullTextTokenizer{lang: lang}
+	case TermTokenizer:
+		return TermTokenizer{lang: lang}
 	case ExactTokenizer:
 		langTag, err := language.Parse(lang)
 		// We default to english if the language is not supported.
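The comment in this type switch captures the design constraint: tokenizer values are shared, so GetTokenizerForLang hands each caller a fresh value carrying its lang rather than mutating shared state. A sketch of the intended call pattern (tokensForLang is a hypothetical helper, not part of the commit):

	// Hypothetical helper: each call builds its own TermTokenizer{lang: ...}
	// copy, so concurrent callers with different langs cannot race.
	func tokensForLang(t Tokenizer, lang, data string) ([]string, error) {
		return BuildTokens(data, GetTokenizerForLang(t, lang))
	}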
