add language aliases for broader support. (dgraph-io#2602)
* add language aliases for broader support.

Reuse known language stopwords with similar languages. If/when support
for these languages is added, the aliases are ignored.

* added a test for all supported and potential fulltext index language tokenizers

Ref: dgraph-io#2601
Gus authored and dna2github committed Jul 19, 2019
1 parent 36057e4 commit 86055b2
Showing 5 changed files with 107 additions and 29 deletions.
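
The mechanism behind the aliases: an entry in langToCode may now hold a comma-separated list of country codes, where the first code is the canonical one and the remaining codes are aliases that reuse the base language's stemmer and stop words. Below is a minimal, self-contained sketch of that lookup, condensed from the tok/fts.go changes in this commit; the two map entries and the main function are illustrative only, and the panic stands in for dgraph's x.AssertTruef.

package main

import (
	"fmt"
	"strings"
)

// Condensed from tok/fts.go: a language maps to a comma-separated list
// of codes; the first is canonical, the rest are aliases.
var langToCode = map[string]string{
	"spanish": "es,es-es,es-419",
	"french":  "fr,fr-ca",
}

// countryCodes returns every code registered for a language.
func countryCodes(lang string) []string {
	codes, ok := langToCode[lang]
	if !ok {
		panic("Unsupported language: " + lang) // stand-in for x.AssertTruef
	}
	return strings.Split(codes, ",")
}

// countryCode returns the canonical (first) code for a language.
func countryCode(lang string) string {
	return countryCodes(lang)[0]
}

func main() {
	fmt.Println(countryCodes("spanish")) // [es es-es es-419]
	fmt.Println(countryCode("french"))   // fr
}

Because the registration loop in initFullTextTokenizers runs once per code, each alias gets its own stemmer, stop-word list, and analyzer, so a lookup for an aliased code such as "es-419" succeeds using the Spanish resources.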
2 changes: 1 addition & 1 deletion edgraph/server.go
@@ -367,7 +367,7 @@ func (s *Server) Mutate(ctx context.Context, mu *api.Mutation) (resp *api.Assign
 	resp.Context, err = query.ApplyMutations(ctx, m)
 	if !mu.CommitNow {
 		if err == y.ErrConflict {
-			err = status.Errorf(codes.FailedPrecondition, err.Error())
+			err = status.Error(codes.FailedPrecondition, err.Error())
 		}
 		return resp, err
 	}
4 changes: 3 additions & 1 deletion posting/index.go
@@ -16,6 +16,8 @@ import (
 	"time"
 
 	"golang.org/x/net/trace"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
 
 	"github.com/dgraph-io/badger"
 
@@ -54,7 +56,7 @@ func indexTokens(attr, lang string, src types.Val) ([]string, error) {
 		if ok {
 			it = newTokenizer
 		} else {
-			return nil, x.Errorf("Tokenizer not available for language: %s", lang)
+			return nil, status.Errorf(codes.Internal, "Tokenizer not available for language: %s", lang)
 		}
 	}
 	if schemaType == types.StringID {
11 changes: 11 additions & 0 deletions posting/index_test.go
@@ -100,6 +100,17 @@ func TestIndexingInvalidLang(t *testing.T) {
 	require.Error(t, err)
 }
 
+func TestIndexingAliasedLang(t *testing.T) {
+	schema.ParseBytes([]byte("name:string @index(fulltext) @lang ."), 1)
+	_, err := indexTokens("name", "es", types.Val{types.StringID, []byte("base")})
+	require.NoError(t, err)
+	// es-es and es-419 are aliased to es
+	_, err = indexTokens("name", "es-es", types.Val{types.StringID, []byte("alias")})
+	require.NoError(t, err)
+	_, err = indexTokens("name", "es-419", types.Val{types.StringID, []byte("alias")})
+	require.NoError(t, err)
+}
+
 func addMutation(t *testing.T, l *List, edge *pb.DirectedEdge, op uint32,
 	startTs uint64, commitTs uint64, index bool) {
 	if op == Del {
64 changes: 37 additions & 27 deletions tok/fts.go
@@ -8,6 +8,8 @@
 package tok
 
 import (
+	"strings"
+
 	"github.com/dgraph-io/dgraph/x"
 
 	"github.com/blevesearch/bleve/analysis/analyzer/custom"
@@ -48,15 +50,18 @@ func initFullTextTokenizers() {
 			continue
 		}
 
-		defineStemmer(lang)
-		defineStopWordsList(lang)
-		defineAnalyzer(lang)
-		registerTokenizer(&FullTextTokenizer{Lang: countryCode(lang)})
+		for _, cc := range countryCodes(lang) {
+			defineStemmer(cc, lang)
+			defineStopWordsList(cc, lang)
+			defineAnalyzer(cc)
+			registerTokenizer(&FullTextTokenizer{Lang: cc})
+		}
 	}
 
 	for _, lang := range [...]string{"chinese", "japanese", "korean"} {
-		defineCJKAnalyzer(lang)
-		registerTokenizer(&FullTextTokenizer{Lang: countryCode(lang)})
+		cc := countryCode(lang)
+		defineCJKAnalyzer(cc)
+		registerTokenizer(&FullTextTokenizer{Lang: cc})
 	}
 
 	// Default full text tokenizer, with Porter stemmer (it works with English only).
@@ -74,16 +79,16 @@ func defineNormalizer() {
 	x.Check(err)
 }
 
-func defineStemmer(lang string) {
-	_, err := bleveCache.DefineTokenFilter(stemmerName(countryCode(lang)), map[string]interface{}{
+func defineStemmer(cc, lang string) {
+	_, err := bleveCache.DefineTokenFilter(stemmerName(cc), map[string]interface{}{
 		"type": stemmer.Name,
 		"lang": lang,
 	})
 	x.Check(err)
 }
 
-func defineStopWordsList(lang string) {
-	name := stopWordsListName(countryCode(lang))
+func defineStopWordsList(cc, lang string) {
+	name := stopWordsListName(cc)
 	_, err := bleveCache.DefineTokenMap(name, map[string]interface{}{
 		"type": tokenmap.Name,
 		"tokens": stopwords[lang],
@@ -116,22 +121,22 @@ func defineDefaultFullTextAnalyzer() {
 			lowercase.Name,
 			normalizerName,
 			stopWordsListName("en"),
-			porter.Name},
+			porter.Name,
+		},
 	})
 	x.Check(err)
 }
 
 // full text search analyzer - does language-specific stop-words removal and stemming
-func defineAnalyzer(lang string) {
-	ln := countryCode(lang)
-	_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(ln), map[string]interface{}{
+func defineAnalyzer(cc string) {
+	_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(cc), map[string]interface{}{
 		"type": custom.Name,
 		"tokenizer": unicode.Name,
 		"token_filters": []string{
 			lowercase.Name,
 			normalizerName,
-			stopWordsListName(ln),
-			stemmerName(ln),
+			stopWordsListName(cc),
+			stemmerName(cc),
 		},
 	})
 	x.Check(err)
@@ -140,9 +145,8 @@ func defineAnalyzer(lang string) {
 // Full text search analyzer - does Chinese/Japanese/Korean style bigram
 // tokenization. It's language unaware (so doesn't do stemming or stop
 // words), but works OK in some contexts.
-func defineCJKAnalyzer(lang string) {
-	ln := countryCode(lang)
-	_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(ln), map[string]interface{}{
+func defineCJKAnalyzer(cc string) {
+	_, err := bleveCache.DefineAnalyzer(FtsTokenizerName(cc), map[string]interface{}{
 		"type": custom.Name,
 		"tokenizer": unicode.Name,
 		"token_filters": []string{
@@ -166,18 +170,24 @@ func stopWordsListName(lang string) string {
 }
 
 func countryCode(lang string) string {
-	code, ok := langToCode[lang]
+	return countryCodes(lang)[0]
+}
+
+func countryCodes(lang string) []string {
+	codes, ok := langToCode[lang]
 	x.AssertTruef(ok, "Unsupported language: %s", lang)
-	return code
+	return strings.Split(codes, ",")
 }
 
 func init() {
 	// List based on https://godoc.org/golang.org/x/text/language#Tag
 	// It contains more languages than supported by Bleve, to enable seamless addition of new langs.
+	// Issue#2601: added aliasing of related languages to broaden support. When those langs are added
+	// the aliases won't matter.
 	langToCode = map[string]string{
 		"afrikaans": "af",
 		"amharic": "am",
-		"arabic": "ar",
+		"arabic": "ar,ar-001",
 		"modernstandardarabic": "ar-001",
 		"azerbaijani": "az",
 		"bulgarian": "bg",
@@ -187,17 +197,17 @@ func init() {
 		"danish": "da",
 		"german": "de",
 		"greek": "el",
-		"english": "en",
+		"english": "en,en-us,en-gb",
 		"americanenglish": "en-us",
 		"britishenglish": "en-gb",
-		"spanish": "es",
+		"spanish": "es,es-es,es-419",
 		"europeanspanish": "es-es",
 		"latinamericanspanish": "es-419",
 		"estonian": "et",
 		"persian": "fa",
 		"finnish": "fi",
 		"filipino": "fil",
-		"french": "fr",
+		"french": "fr,fr-ca",
 		"canadianfrench": "fr-ca",
 		"gujarati": "gu",
 		"hebrew": "he",
@@ -229,7 +239,7 @@ func init() {
 		"norwegian": "no",
 		"punjabi": "pa",
 		"polish": "pl",
-		"portuguese": "pt",
+		"portuguese": "pt,pt-br,pt-pt",
 		"brazilianportuguese": "pt-br",
 		"europeanportuguese": "pt-pt",
 		"romanian": "ro",
@@ -238,7 +248,7 @@ func init() {
 		"slovak": "sk",
 		"slovenian": "sl",
 		"albanian": "sq",
-		"serbian": "sr",
+		"serbian": "sr,sr-latn",
 		"serbianlatin": "sr-latn",
 		"swedish": "sv",
 		"swahili": "sw",
55 changes: 55 additions & 0 deletions tok/tok_test.go
@@ -133,6 +133,61 @@ func TestFullTextTokenizerLang(t *testing.T) {
 	require.Equal(t, []string{encodeToken("auffass", id), encodeToken("katz", id)}, tokens)
 }
 
+func TestFullTextTokenizerSupportedLangs(t *testing.T) {
+	var tests = []struct {
+		lang string
+		ok   bool
+	}{
+		{"ar", false},
+		{"ar-001", false},
+		{"bg", false},
+		{"ca", false},
+		{"cjk", false},
+		{"ckb", false},
+		{"cs", false},
+		{"da", true},
+		{"de", true},
+		{"el", false},
+		{"en", true},
+		{"en-us", true},
+		{"en-gb", true},
+		{"es", true},
+		{"es-es", true},
+		{"es-419", true},
+		{"eu", false},
+		{"fa", false},
+		{"fi", true},
+		{"fr", true},
+		{"fr-ca", true},
+		{"ga", false},
+		{"gl", false},
+		{"hi", false},
+		{"hu", true},
+		{"hy", false},
+		{"id", false},
+		{"in", false},
+		{"it", true},
+		{"nl", true},
+		{"no", true},
+		{"pt", true},
+		{"pt-br", true},
+		{"pt-pt", true},
+		{"ro", true},
+		{"ru", true},
+		{"sr", false},
+		{"sr-latin", false},
+		{"sv", true},
+		{"tr", true},
+	}
+	for _, test := range tests {
+		tokenizer, ok := GetTokenizer(FtsTokenizerName(test.lang))
+		require.Equal(t, test.ok, ok, "Fulltext tokenizer for %q failed", test.lang)
+		if test.ok {
+			require.NotNil(t, tokenizer)
+		}
+	}
+}
+
 func TestTermTokenizer(t *testing.T) {
 	tokenizer, has := GetTokenizer("term")
 	require.True(t, has)
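
With the aliases registered, asking for a full-text tokenizer by an aliased code resolves to the same analyzer as the base language. A usage sketch mirroring the calls exercised in tok/tok_test.go above, assuming (as the test does) that the tok package registers its tokenizers at init time:

package main

import (
	"fmt"

	"github.com/dgraph-io/dgraph/tok"
)

func main() {
	// "es", "es-es" and "es-419" should all resolve to the Spanish
	// full-text analyzer after this commit.
	for _, lang := range []string{"es", "es-es", "es-419"} {
		tokenizer, ok := tok.GetTokenizer(tok.FtsTokenizerName(lang))
		fmt.Printf("%s: registered=%v, tokenizer non-nil=%v\n", lang, ok, tokenizer != nil)
	}
}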
