Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add Bulgarian language stemmer #291

Merged
merged 2 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions stemmer/lib/bg.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export declare function stemmer(word: string): string;
159 changes: 159 additions & 0 deletions stemmer/lib/bg.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
* Light Stemmer for Bulgarian.
*
* Converted from the Java implementation of Apache {@link https://github.com/apache/lucene | Lucene}.
*
* Implements the algorithm described in: *Searching Strategies for the Bulgarian Language*
* {@link http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf}
*/
class BulgarianStemmer {
stem(s) {
const calculatedLength = this.calculateStemLength(s, s.length);
return s.substring(0, calculatedLength);
}

/**
* Stem an input buffer of Bulgarian text.
*
* @param {string} s input buffer
* @param {number} len length of input buffer
* @return {number} length of input buffer after normalization
*/
calculateStemLength(s, len) {
if (len < 4) {
// do not stem
return len;
}

if (len > 5 && this.endsWith(s, len, "ища")) {
return len - 3;
}

len = this.removeArticle(s, len);
len = this.removePlural(s, len);

if (len > 3) {
if (this.endsWith(s, len, "я")) {
len--;
}
if (this.endsWith(s, len, "а") || this.endsWith(s, len, "о") || this.endsWith(s, len, "е")) {
len--;
}
}

// the rule to rewrite ен -> н is duplicated in the paper.
// in the perl implementation referenced by the paper, this is fixed.
// (it is fixed here as well)
if (len > 4 && this.endsWith(s, len, "ен")) {
s[len - 2] = "н"; // replace with н
len--;
}

if (len > 5 && s[len - 2] == "ъ") {
s[len - 2] = s[len - 1]; // replace ъN with N
len--;
}

return len;
}

/**
* Mainly remove the definite article
*
* @param {string} s input buffer
* @param {number} len length of input buffer
* @return {number} new stemmed length
*/
removeArticle(s, len) {
if (len > 6 && this.endsWith(s, len, "ият")) return len - 3;

if (len > 5) {
if (
this.endsWith(s, len, "ът") ||
this.endsWith(s, len, "то") ||
this.endsWith(s, len, "те") ||
this.endsWith(s, len, "та") ||
this.endsWith(s, len, "ия")
) {
return len - 2;
}
}

if (len > 4 && this.endsWith(s, len, "ят")) {
return len - 2;
}

return len;
}

/**
* Remove the plural from the input string
*
* @param {string} s input buffer
* @param {number} len length of input buffer
* @return {number} new stemmed length
*/
removePlural(s, len) {
if (len > 6) {
if (this.endsWith(s, len, "овци")) return len - 3; // replace with о
if (this.endsWith(s, len, "ове")) return len - 3;
if (this.endsWith(s, len, "еве")) {
s[len - 3] = "й"; // replace with й
return len - 2;
}
}

if (len > 5) {
if (this.endsWith(s, len, "ища")) return len - 3;
if (this.endsWith(s, len, "та")) return len - 2;
if (this.endsWith(s, len, "ци")) {
s[len - 2] = "к"; // replace with к
return len - 1;
}
if (this.endsWith(s, len, "зи")) {
s[len - 2] = "г"; // replace with г
return len - 1;
}

if (s[len - 3] == "е" && s[len - 1] == "и") {
s[len - 3] = "я"; // replace е with я, remove и
return len - 1;
}
}

if (len > 4) {
if (this.endsWith(s, len, "си")) {
s[len - 2] = "х"; // replace with х
return len - 1;
}
if (this.endsWith(s, len, "и")) return len - 1;
}

return len;
}

/**
* Returns true if the character array ends with the suffix.
*
* This is a helper function for the stemmer from the original Java implementation.
* {@link https://github.com/apache/lucene/blob/main/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java#L68}
*
* @param {string} s Input Buffer
* @param {number} len length of input buffer
* @param {string} suffix Suffix string to test
* @return true if `s` ends with `suffix`
*/
endsWith(s, len, suffix) {
let suffixLen = suffix.length;
if (suffixLen > len) return false;
for (let i = suffixLen - 1; i >= 0; i--) if (s[len - (suffixLen - i)] != suffix[i]) return false;

return true;
}
}

const stemmerInstance = new BulgarianStemmer();

export function stemmer(word) {
return stemmerInstance.stem(word);
}
40 changes: 34 additions & 6 deletions tests/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ import { stemmer as DEStemmer } from "../stemmer/lib/de.js";
import { stemmer as FIStemmer } from "../stemmer/lib/fi.js";
import { stemmer as DKStemmer } from "../stemmer/lib/dk.js";
import { stemmer as UKStemmer } from "../stemmer/lib/uk.js";
import { stemmer as BGStemmer } from "../stemmer/lib/bg.js";

t.test("Tokenizer", t => {
t.plan(14);
t.plan(15);

t.test("Should tokenize and stem correctly in english", t => {
t.plan(2);
Expand Down Expand Up @@ -363,6 +364,7 @@ t.test("Tokenizer", t => {
t.strictSame(O1, ["sovn", "svar", "ting", "prov", "mislyk"]);
t.strictSame(O2, ["bagt", "smakag"]);
});

t.test("Should tokenize and stem correctly in ukrainian", t => {
t.plan(2);

Expand All @@ -375,8 +377,7 @@ t.test("Tokenizer", t => {
false,
defaultTokenizerConfig("ukrainian", {
stemmingFn: UKStemmer,
customStopWords:[]

customStopWords: [],
}),
);
const O2 = tokenize(
Expand All @@ -385,13 +386,40 @@ t.test("Tokenizer", t => {
false,
defaultTokenizerConfig("ukrainian", {
stemmingFn: UKStemmer,
customStopWords:[]
customStopWords: [],
}),
);
t.strictSame(O1, ["кол", "тест", "не", "проход", "спат","важк"]);
t.strictSame(O2, ["я", "приготувал","тістечк"]);
t.strictSame(O1, ["кол", "тест", "не", "проход", "спат", "важк"]);
t.strictSame(O2, ["я", "приготувал", "тістечк"]);
});

t.test("Should tokenize and stem correctly in bulgarian", t => {
t.plan(2);

const I1 = "Кокошката е малка крава която не може да се събере с теста";
const I2 = "Има първа вероятност да се случи нещо неочаквано докато се изпълняват тестовете";

const O1 = tokenize(
I1,
"bulgarian",
false,
defaultTokenizerConfig("bulgarian", {
stemmingFn: BGStemmer,
customStopWords: [],
}),
);
const O2 = tokenize(
I2,
"bulgarian",
true,
defaultTokenizerConfig("bulgarian", {
stemmingFn: BGStemmer,
customStopWords: [],
}),
);
t.strictSame(O1, ["кокошк", "е", "малк", "крав", "коят", "не", "мож", "да", "се", "събер", "с", "тест"]);
t.strictSame(O2, ["има", "първ", "вероятност", "да", "се", "случ", "нещ", "неочакван", "док", "се", "изпълняват", "тест"]);
});
});

t.test("Custom stop-words rules", t => {
Expand Down