Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ukrainian language #252

Merged
merged 4 commits into from
Jan 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ async function main() {
main();
```

Right now, Lyra supports 23 languages and stemmers out of the box:
Right now, Lyra supports 24 languages and stemmers out of the box:

- Armenian
- Arabic
Expand All @@ -378,6 +378,7 @@ Right now, Lyra supports 23 languages and stemmers out of the box:
- Serbian
- Swedish
- Turkish
- Ukrainian

## Hooks

Expand Down
1 change: 1 addition & 0 deletions src/tokenizer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ const splitRegex: Record<Language, RegExp> = {
armenian: /[^a-z0-9ա-ֆ]+/gim,
greek: /[^a-z0-9α-ωά-ώ]+/gim,
indonesian: /[^a-z0-9]+/gim,
ukrainian:/[^a-z0-9а-яА-ЯіїєІЇЄ]+/gim
};

export const normalizationCache = new Map();
Expand Down
1 change: 1 addition & 0 deletions src/tokenizer/languages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ export const SUPPORTED_LANGUAGES = [
"romanian",
"serbian",
"turkish",
"ukrainian"
];
1 change: 1 addition & 0 deletions stemmer/lib/uk.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/**
 * Type declaration for the Ukrainian stemmer implemented in `uk.js`.
 *
 * @param word - The word to stem.
 * @returns The stemmed form of `word`.
 */
export declare function stemmer(word: string): string;
100 changes: 100 additions & 0 deletions stemmer/lib/uk.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*@

Russian stemming algorithm provided by Dr Martin Porter (snowball.tartarus.org):
http://snowball.tartarus.org/algorithms/russian/stemmer.html

Algorithm implementation in PHP provided by Dmitry Koterov (dklab.ru):
http://forum.dklab.ru/php/advises/HeuristicWithoutTheDictionaryExtractionOfARootFromRussianWord.html

Algorithm implementation adapted for Drupal by Algenon (4algenon@gmail.com):
https://drupal.org/project/ukstemmer

Algorithm implementation in Node by Zakharov Kyrylo
https://github.com/Amice13

*/

// Lowercase Ukrainian vowels. The first vowel of a word splits it into a
// prefix that is kept verbatim and a remainder that the suffix rules act on.
const vowel = new RegExp('[аеиоуюяіїє]')

const perfectiveGround = new RegExp('(?:[иы]в(?:ши(?:сь)?)?|(?<=[ая])(?:в(?:ши(?:сь)?)?))$')

// http://uk.wikipedia.org/wiki/Рефлексивне_дієслово
const reflexive = new RegExp('с[яьи]$')

// http://uk.wikipedia.org/wiki/Прикметник + http://wapedia.mobi/uk/Прикметник
const adjective = new RegExp('(?:[аеєуюя]|еє|ем|єє|ий|их|іх|ів|ій|ім|їй|ім|им|ими|іми|йми|ої|ою|ова|ове|ого|ому)$')

// http://uk.wikipedia.org/wiki/Дієприкметник
const participle = new RegExp('(?:[аіу]|ій|ий|им|ім|их|йми|ого|ому|ою)$')

// None of the patterns below carry the `g` flag: they are all anchored with
// `$`, so at most one match exists, and a global regex used with `.test()`
// keeps `lastIndex` between calls, which would make results depend on the
// previously stemmed word.

// http://uk.wikipedia.org/wiki/Дієслово
const verb = new RegExp('(?:[еєую]|ав|али|ати|вши|ив|ити|ме|сь|ся|ши|учи|яти|ячи|ать|ять)$')

// http://uk.wikipedia.org/wiki/Іменник
const noun = new RegExp('(?:[аеєіїийоуыьюя]|ам|ах|ами|ев|еві|еи|ей|ем|ею|єм|єю|ів|їв|ий|ием|ию|ия|иям|иях|ов|ові|ой|ом|ою|ью|ья|ям|ями|ях)$')

const derivational = new RegExp('[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*(?<=о)сть?$')

const step2 = new RegExp('и$')
const step3 = new RegExp('ость$')
const step41 = new RegExp('ь$')
const step42 = new RegExp('ейше$')
const step43 = new RegExp('нн$')

const alterations = new RegExp('([гджзкстхцчш]|ст|дж|ждж|ьц|сі|ці|зі|он|ін|ів|ев|ок|шк)$')

/**
 * Stems a single Ukrainian word by stripping suffixes from the part of the
 * word that follows its first vowel.
 *
 * The word is lowercased before any matching, so the vowel position and the
 * suffix rules always operate on the same string. If the word contains no
 * vowel, or nothing follows its first vowel, the input is returned unchanged
 * (original casing included).
 *
 * @param {string} string - The word to stem.
 * @param {boolean} [strict=false] - When true, additionally strips the
 *   trailing consonant alternations listed in `alterations`.
 * @returns {string} The lowercased stem, or the untouched input in the
 *   early-exit cases described above.
 */
const ukrstemmer = (string, strict = false) => {
  const lowered = string.toLowerCase()

  // Search the lowercased copy: the vowel class only lists lowercase letters,
  // and the index must refer to the same string that gets sliced below.
  const firstVowelIndex = lowered.search(vowel)
  if (firstVowelIndex === -1) return string

  const wordStart = lowered.slice(0, firstVowelIndex + 1)
  let rest = lowered.slice(firstVowelIndex + 1)
  if (rest === '') return string

  // Applies one replacement to `rest` and reports whether anything changed.
  // The working string is local to this call, so no state leaks between words.
  const strip = (pattern, replacement = '') => {
    const before = rest
    rest = rest.replace(pattern, replacement)
    return rest !== before
  }

  // Step 1: a perfective-gerund ending wins outright; otherwise drop a
  // reflexive ending, then try adjective (+ participle), verb, noun in order.
  if (!strip(perfectiveGround)) {
    strip(reflexive)
    if (strip(adjective)) {
      strip(participle)
    } else if (!strip(verb)) {
      strip(noun)
    }
  }

  // Step 2: drop a trailing "и".
  strip(step2)

  // Step 3: drop "ость" only when the remainder has the derivational shape.
  if (derivational.test(rest)) {
    strip(step3)
  }

  // Step 4: drop a trailing soft sign; failing that, drop "ейше" and
  // collapse a trailing double "н".
  if (!strip(step41)) {
    strip(step42)
    strip(step43, 'н')
  }

  if (strict) {
    strip(alterations)
  }

  return wordStart + rest
}

/**
 * Public entry point: stems a Ukrainian word in non-strict mode.
 *
 * @param {string} word - The word to stem.
 * @returns {string} The stem produced by `ukrstemmer(word)`.
 */
export function stemmer(word) {
  return ukrstemmer(word)
}

32 changes: 31 additions & 1 deletion tests/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ import { stemmer as NLStemmer } from "../stemmer/lib/nl.js";
import { stemmer as DEStemmer } from "../stemmer/lib/de.js";
import { stemmer as FIStemmer } from "../stemmer/lib/fi.js";
import { stemmer as DKStemmer } from "../stemmer/lib/dk.js";
import { stemmer as UKStemmer } from "../stemmer/lib/uk.js";

t.test("Tokenizer", t => {
t.plan(13);
t.plan(14);

t.test("Should tokenize and stem correctly in english", t => {
t.plan(2);
Expand Down Expand Up @@ -362,6 +363,35 @@ t.test("Tokenizer", t => {
t.strictSame(O1, ["sovn", "svar", "ting", "prov", "mislyk"]);
t.strictSame(O2, ["bagt", "smakag"]);
});
t.test("Should tokenize and stem correctly in ukrainian", t => {
  t.plan(2);

  // Runs the tokenizer for Ukrainian with the Ukrainian stemmer and an empty
  // custom stop-word list; a fresh config is built for every call.
  const tokenizeUkrainian = (input: string) =>
    tokenize(
      input,
      "ukrainian",
      false,
      defaultTokenizerConfig("ukrainian", {
        stemmingFn: UKStemmer,
        customStopWords: [],
      }),
    );

  const firstOutput = tokenizeUkrainian("Коли тести не проходять, спати важко");
  const secondOutput = tokenizeUkrainian("я приготувала тістечка");

  t.strictSame(firstOutput, ["кол", "тест", "не", "проход", "спат", "важк"]);
  t.strictSame(secondOutput, ["я", "приготувал", "тістечк"]);
});

});

t.test("Custom stop-words rules", t => {
Expand Down