Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to show suggestions for inputs that already have native language characters #53

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion govarnam/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func getVSTLookupDirs() []string {
}
}

//FindVSTDir Get the VST storing directory
// FindVSTDir Get the VST storing directory
func FindVSTDir() (string, error) {
for _, loc := range getVSTLookupDirs() {
if dirExists(loc) {
Expand Down
4 changes: 3 additions & 1 deletion govarnam/govarnam.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"sort"
"strings"
"time"
"unicode"
"unicode/utf8"

// sqlite3
Expand All @@ -25,6 +26,7 @@ type LangRules struct {
Virama string
IndicDigits bool
PatternLongestLength int // Longest length of pattern in VST
UnicodeBlock unicode.RangeTable
}

// SchemeDetails of VST
Expand Down Expand Up @@ -247,8 +249,8 @@ func (varnam *Varnam) setDefaultConfig() {
varnam.DictionaryMatchExact = false

varnam.LangRules.IndicDigits = false

varnam.LangRules.Virama, _ = varnam.getVirama()
varnam.LangRules.UnicodeBlock = varnam.getUnicodeBlock()

if varnam.SchemeDetails.LangCode == "ml" {
varnam.RegisterPatternWordPartializer(varnam.mlPatternWordPartializer)
Expand Down
13 changes: 13 additions & 0 deletions govarnam/govarnam_ml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@ func TestMLRecentlyLearnedWords(t *testing.T) {
}

result, err = varnam.GetRecentlyLearntWords(context.Background(), 4, len(words))
checkError(err)
assertEqual(t, result[0].Word, "ആലപ്പുഴ")
}

Expand All @@ -495,3 +496,15 @@ func TestMLGetSuggestions(t *testing.T) {

assertEqual(t, result[0].Word, "ആലപ്പുഴ")
}

// TestMLNativePartialWordsInInput learns two Malayalam words and then checks
// that inputs mixing Malayalam characters with Latin ones still yield the
// learnt word as the first dictionary suggestion.
func TestMLNativePartialWordsInInput(t *testing.T) {
	varnam := getVarnamInstance("ml")

	// Seed the dictionary so that there is something to suggest.
	for _, learnt := range []string{"ആലപ്പുഴ", "പുസ്തകം"} {
		varnam.Learn(learnt, 0)
	}

	cases := []struct {
		input string
		want  string
	}{
		{input: "ആലppu", want: "ആലപ്പുഴ"},
		{input: "puസ്ത", want: "പുസ്തകം"},
	}

	for _, tc := range cases {
		top := varnam.TransliterateAdvanced(tc.input).DictionarySuggestions[0]
		assertEqual(t, top.Word, tc.want)
	}
}
14 changes: 14 additions & 0 deletions govarnam/govarnam_ml.go → govarnam/lang_specific_rules.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package govarnam

import "unicode"

/**
* govarnam - An Indian language transliteration library
* Copyright Subin Siby <mail at subinsb (.) com>, 2021
Expand All @@ -23,3 +25,15 @@ func (varnam *Varnam) mlPatternWordPartializer(sug *Suggestion) {
sug.Word = sug.Word[0:len(sug.Word)-size] + "മ"
}
}

// getUnicodeBlock picks the Unicode range table for the scheme's language,
// keyed on varnam.SchemeDetails.LangCode. Only "kn" and "ml" have entries so
// far; any other language code yields the zero-value table.
func (varnam *Varnam) getUnicodeBlock() unicode.RangeTable {
	var block unicode.RangeTable

	// TODO add for all languages
	if code := varnam.SchemeDetails.LangCode; code == "kn" {
		block.R16 = []unicode.Range16{{Lo: 0x0C80, Hi: 0x0CFF, Stride: 1}}
	} else if code == "ml" {
		block.R16 = []unicode.Range16{{Lo: 0x0D00, Hi: 0x0D7F, Stride: 1}}
	}

	return block
}
16 changes: 12 additions & 4 deletions govarnam/symbol.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"fmt"
"log"
"strings"
"unicode"

"github.com/mattn/go-sqlite3"
)
Expand Down Expand Up @@ -274,10 +275,17 @@ func (varnam *Varnam) tokenizeWord(ctx context.Context, word string, matchType i
matches := varnam.findLongestPatternMatchSymbols(ctx, sequence, matchType, acceptCondition)

if len(matches) == 0 {
// No matches, add a character token
// Note that we just add 1 character, and move on
token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])}
results = append(results, token)
if unicode.In(sequence[0], &varnam.LangRules.UnicodeBlock) {
// This helps to get suggestions in inputs like "ആലppu"
character := string(sequence[0])
token := Token{VARNAM_TOKEN_SYMBOL, []Symbol{{Value1: character}}, i, character}
results = append(results, token)
} else {
// No matches, add a character token
// Note that we just add 1 character, and move on
token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])}
results = append(results, token)
}

i++
} else {
Expand Down
2 changes: 1 addition & 1 deletion govarnamgo/govarnamgo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func tearDown() {

func TestMain(m *testing.M) {
var err error
testTempDir, err = os.TempDir("", "govarnam_test")
testTempDir, err = os.MkdirTemp("", "govarnamgo_test")
checkError(err)

setUp("ml")
Expand Down
Loading