oramasearch · micheleriva · Jan 19, 2023 · Jan 17, 2023 · Jan 17, 2023 · Jan 19, 2023
diff --git a/README.md b/README.md
@@ -353,7 +353,7 @@ async function main() {
 main();
 ```
 
-Right now, Lyra supports 23 languages and stemmers out of the box:
+Right now, Lyra supports 24 languages and stemmers out of the box:
 
 - Armenian
 - Arabic
@@ -378,6 +378,7 @@ Right now, Lyra supports 23 languages and stemmers out of the box:
 - Serbian
 - Swedish
 - Turkish
+- Ukrainian
 
 ## Hooks
 

diff --git a/src/tokenizer/index.ts b/src/tokenizer/index.ts
@@ -59,6 +59,7 @@ const splitRegex: Record<Language, RegExp> = {
   armenian: /[^a-z0-9ա-ֆ]+/gim,
   greek: /[^a-z0-9α-ωά-ώ]+/gim,
   indonesian: /[^a-z0-9]+/gim,
+  ukrainian:/[^a-z0-9а-яА-ЯіїєІЇЄ]+/gim
 };
 
 export const normalizationCache = new Map();

diff --git a/src/tokenizer/languages.ts b/src/tokenizer/languages.ts
@@ -25,4 +25,5 @@ export const SUPPORTED_LANGUAGES = [
   "romanian",
   "serbian",
   "turkish",
+  "ukrainian"
 ];
diff --git a/stemmer/lib/uk.d.ts b/stemmer/lib/uk.d.ts
@@ -0,0 +1 @@
/**
 * Type declaration for the `stemmer` function exported by the sibling `uk.js` module.
 *
 * @param word - The word to stem.
 * @returns The stemmed form of `word`.
 */
export declare function stemmer(word: string): string;
diff --git a/stemmer/lib/uk.js b/stemmer/lib/uk.js
@@ -0,0 +1,100 @@
+/*@
+
+  Russian stemming algorithm provided by Dr Martin Porter (snowball.tartarus.org):
+  http://snowball.tartarus.org/algorithms/russian/stemmer.html
+
+  Algorithm implementation in PHP provided by Dmitry Koterov (dklab.ru):
+  http://forum.dklab.ru/php/advises/HeuristicWithoutTheDictionaryExtractionOfARootFromRussianWord.html
+
+  Algorithm implementation adapted for Drupal by Algenon (4algenon@gmail.com):
+  https://drupal.org/project/ukstemmer
+
+  Algorithm implementation in Node by Zakharov Kyrylo
+  https://github.com/Amice13
+
+*/
+
// Lower-case vowels; used to find the first vowel of a word. Everything up to
// and including that vowel is never touched by the suffix rules below.
const vowel = new RegExp('[аеиоуюяіїє]')

const perfectiveGround = new RegExp('(?:[иы]в(?:ши(?:сь)?)?|(?<=[ая])(?:в(?:ши(?:сь)?)?))$')

// http://uk.wikipedia.org/wiki/Рефлексивне_дієслово
const reflexive = new RegExp('с[яьи]$')

// http://uk.wikipedia.org/wiki/Прикметник + http://wapedia.mobi/uk/Прикметник
const adjective = new RegExp('(?:[аеєуюя]|еє|ем|єє|ий|их|іх|ів|ій|ім|їй|ім|им|ими|іми|йми|ої|ою|ова|ове|ого|ому)$')

// http://uk.wikipedia.org/wiki/Дієприкметник
const participle = new RegExp('(?:[аіу]|ій|ий|им|ім|их|йми|ого|ому|ою)$')

// NOTE: none of the patterns below carry the `g` flag. They are all anchored
// with `$`, so a global search adds nothing, and a global RegExp keeps a
// `lastIndex` between `.test()` calls, which made results depend on the
// previously stemmed word.

// http://uk.wikipedia.org/wiki/Дієслово
const verb = new RegExp('(?:[еєую]|ав|али|ати|вши|ив|ити|ме|сь|ся|ши|учи|яти|ячи|ать|ять)$')

// http://uk.wikipedia.org/wiki/Іменник
const noun = new RegExp('(?:[аеєіїийоуыьюя]|ам|ах|ами|ев|еві|еи|ей|ем|ею|єм|єю|ів|їв|ий|ием|ию|ия|иям|иях|ов|ові|ой|ом|ою|ью|ья|ям|ями|ях)$')

const derivational = new RegExp('[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*(?<=о)сть?$')

const step2 = new RegExp('и$')
const step3 = new RegExp('ость$')
const step41 = new RegExp('ь$')
const step42 = new RegExp('ейше$')
const step43 = new RegExp('нн$')

const alterations = new RegExp('([гджзкстхцчш]|ст|дж|ждж|ьц|сі|ці|зі|он|ін|ів|ев|ок|шк)$')

/**
 * Stems a single word with the suffix-stripping rules defined above.
 *
 * The word is lower-cased, split right after its first vowel, and the rules
 * are applied only to the part that follows that vowel.
 *
 * @param {string} string - The word to stem.
 * @param {boolean} [strict=false] - When true, additionally strips the
 *   endings listed in `alterations` as a final step.
 * @returns {string} The lower-cased stem. The input is returned unchanged
 *   when it contains no vowel or when nothing follows its first vowel.
 */
const ukrstemmer = (string, strict = false) => {
  const lowered = string.toLowerCase()

  // Search the lower-cased copy: `vowel` only lists lower-case letters, and
  // the index must refer to the same string that is sliced below.
  const firstVowel = lowered.match(vowel)
  if (firstVowel === null) return string

  const splitAt = firstVowel.index + 1
  const wordStart = lowered.slice(0, splitAt)
  let rest = lowered.slice(splitAt)
  if (rest === '') return string

  // Applies one rule to `rest` and reports whether it changed anything.
  const apply = (pattern, replacement = '') => {
    const next = rest.replace(pattern, replacement)
    const changed = next !== rest
    rest = next
    return changed
  }

  // Step 1
  if (!apply(perfectiveGround)) {
    apply(reflexive)
    if (apply(adjective)) {
      apply(participle)
    } else if (!apply(verb)) {
      apply(noun)
    }
  }

  // Step 2
  apply(step2)

  // Step 3
  if (derivational.test(rest)) {
    apply(step3)
  }

  // Step 4
  if (!apply(step41)) {
    apply(step42)
    apply(step43, 'н')
  }

  if (strict) {
    apply(alterations)
  }

  return wordStart + rest
}

/**
 * Stems a single Ukrainian word (non-strict mode).
 *
 * @param {string} word - The word to stem.
 * @returns {string} The stemmed word.
 */
export function stemmer(word) {
  return ukrstemmer(word)
}
+
diff --git a/tests/tokenizer.test.ts b/tests/tokenizer.test.ts
@@ -14,9 +14,10 @@ import { stemmer as NLStemmer } from "../stemmer/lib/nl.js";
 import { stemmer as DEStemmer } from "../stemmer/lib/de.js";
 import { stemmer as FIStemmer } from "../stemmer/lib/fi.js";
 import { stemmer as DKStemmer } from "../stemmer/lib/dk.js";
+import { stemmer as UKStemmer } from "../stemmer/lib/uk.js";
 
 t.test("Tokenizer", t => {
-  t.plan(13);
+  t.plan(14);
 
   t.test("Should tokenize and stem correctly in english", t => {
     t.plan(2);
@@ -362,6 +363,35 @@ t.test("Tokenizer", t => {
     t.strictSame(O1, ["sovn", "svar", "ting", "prov", "mislyk"]);
     t.strictSame(O2, ["bagt", "smakag"]);
   });
+  t.test("Should tokenize and stem correctly in ukrainian", t => {
+    t.plan(2);
+
+    const I1 = "Коли тести не проходять, спати важко";
+    const I2 = "я приготувала тістечка";
+
+    const O1 = tokenize(
+      I1,
+      "ukrainian",
+      false,
+      defaultTokenizerConfig("ukrainian", {
+        stemmingFn: UKStemmer,
+        customStopWords:[]
+
+      }),
+    );
+    const O2 = tokenize(
+      I2,
+      "ukrainian",
+      false,
+      defaultTokenizerConfig("ukrainian", {
+        stemmingFn: UKStemmer,
+        customStopWords:[]
+      }),
+    );
+    t.strictSame(O1, ["кол", "тест", "не", "проход", "спат","важк"]);
+    t.strictSame(O2, ["я", "приготувал","тістечк"]);
+  });
+
 });
 
 t.test("Custom stop-words rules", t => {
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		export declare function stemmer(word: string): string;