From 29a8c7b7890d7d796b7cca29361fd517efef87dc Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Mon, 20 Nov 2023 23:41:44 +0530 Subject: [PATCH] Add support to show suggestions for inputs that already have native language characters --- govarnam/constants.go | 2 +- govarnam/govarnam.go | 4 +++- govarnam/govarnam_ml_test.go | 13 +++++++++++++ .../{govarnam_ml.go => lang_specific_rules.go} | 14 ++++++++++++++ govarnam/symbol.go | 16 ++++++++++++---- govarnamgo/govarnamgo_test.go | 2 +- 6 files changed, 44 insertions(+), 7 deletions(-) rename govarnam/{govarnam_ml.go => lang_specific_rules.go} (69%) diff --git a/govarnam/constants.go b/govarnam/constants.go index 01b0258..5a47448 100644 --- a/govarnam/constants.go +++ b/govarnam/constants.go @@ -98,7 +98,7 @@ func getVSTLookupDirs() []string { } } -//FindVSTDir Get the VST storing directory +// FindVSTDir Get the VST storing directory func FindVSTDir() (string, error) { for _, loc := range getVSTLookupDirs() { if dirExists(loc) { diff --git a/govarnam/govarnam.go b/govarnam/govarnam.go index 2928341..2d50876 100644 --- a/govarnam/govarnam.go +++ b/govarnam/govarnam.go @@ -14,6 +14,7 @@ import ( "sort" "strings" "time" + "unicode" "unicode/utf8" // sqlite3 @@ -25,6 +26,7 @@ type LangRules struct { Virama string IndicDigits bool PatternLongestLength int // Longest length of pattern in VST + UnicodeBlock unicode.RangeTable } // SchemeDetails of VST @@ -247,8 +249,8 @@ func (varnam *Varnam) setDefaultConfig() { varnam.DictionaryMatchExact = false varnam.LangRules.IndicDigits = false - varnam.LangRules.Virama, _ = varnam.getVirama() + varnam.LangRules.UnicodeBlock = varnam.getUnicodeBlock() if varnam.SchemeDetails.LangCode == "ml" { varnam.RegisterPatternWordPartializer(varnam.mlPatternWordPartializer) diff --git a/govarnam/govarnam_ml_test.go b/govarnam/govarnam_ml_test.go index 877de53..8dda439 100644 --- a/govarnam/govarnam_ml_test.go +++ b/govarnam/govarnam_ml_test.go @@ -478,6 +478,7 @@ func TestMLRecentlyLearnedWords(t *testing.T) { } result, err = varnam.GetRecentlyLearntWords(context.Background(), 4, len(words)) + checkError(err) assertEqual(t, result[0].Word, "ആലപ്പുഴ") } @@ -495,3 +496,15 @@ func TestMLGetSuggestions(t *testing.T) { assertEqual(t, result[0].Word, "ആലപ്പുഴ") } + +func TestMLNativePartialWordsInInput(t *testing.T) { + varnam := getVarnamInstance("ml") + + words := []string{"ആലപ്പുഴ", "പുസ്തകം"} + for _, word := range words { + varnam.Learn(word, 0) + } + + assertEqual(t, varnam.TransliterateAdvanced("ആലppu").DictionarySuggestions[0].Word, "ആലപ്പുഴ") + assertEqual(t, varnam.TransliterateAdvanced("പുസ്തkam").DictionarySuggestions[0].Word, "പുസ്തകം") +} diff --git a/govarnam/govarnam_ml.go b/govarnam/lang_specific_rules.go similarity index 69% rename from govarnam/govarnam_ml.go rename to govarnam/lang_specific_rules.go index 632a4d5..6bf0f61 100644 --- a/govarnam/govarnam_ml.go +++ b/govarnam/lang_specific_rules.go @@ -1,5 +1,7 @@ package govarnam +import "unicode" + /** * govarnam - An Indian language transliteration library * Copyright Subin Siby , 2021 @@ -23,3 +25,15 @@ func (varnam *Varnam) mlPatternWordPartializer(sug *Suggestion) { sug.Word = sug.Word[0:len(sug.Word)-size] + "മ" } } + +func (varnam *Varnam) getUnicodeBlock() unicode.RangeTable { + switch varnam.SchemeDetails.LangCode { + case "kn": + return unicode.RangeTable{R16: []unicode.Range16{{0x0C80, 0x0CFF, 1}}} + case "ml": + return unicode.RangeTable{R16: []unicode.Range16{{0x0D00, 0x0D7F, 1}}} + default: + return unicode.RangeTable{} + } + // TODO add for all languages +} diff --git a/govarnam/symbol.go b/govarnam/symbol.go index a02d617..a92cdd1 100644 --- a/govarnam/symbol.go +++ b/govarnam/symbol.go @@ -12,6 +12,7 @@ import ( "fmt" "log" "strings" + "unicode" "github.com/mattn/go-sqlite3" ) @@ -274,10 +275,17 @@ func (varnam *Varnam) tokenizeWord(ctx context.Context, word string, matchType i matches := varnam.findLongestPatternMatchSymbols(ctx, sequence, matchType, acceptCondition) if len(matches) == 0 { - // No matches, add a character token - // Note that we just add 1 character, and move on - token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])} - results = append(results, token) + if unicode.In(sequence[0], &varnam.LangRules.UnicodeBlock) { + // This helps to get suggestions in inputs like "ആലppu" + character := string(sequence[0]) + token := Token{VARNAM_TOKEN_SYMBOL, []Symbol{{Value1: character}}, i, character} + results = append(results, token) + } else { + // No matches, add a character token + // Note that we just add 1 character, and move on + token := Token{VARNAM_TOKEN_CHAR, matches, i, string(sequence[:1])} + results = append(results, token) + } i++ } else { diff --git a/govarnamgo/govarnamgo_test.go b/govarnamgo/govarnamgo_test.go index f75e8f6..fb32587 100644 --- a/govarnamgo/govarnamgo_test.go +++ b/govarnamgo/govarnamgo_test.go @@ -67,7 +67,7 @@ func tearDown() { func TestMain(m *testing.M) { var err error - testTempDir, err = os.TempDir("", "govarnam_test") + testTempDir, err = os.MkdirTemp("", "govarnamgo_test") checkError(err) setUp("ml")