Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(contentful): Reimplementation of the stemmer-uk without lookbehind regex. #1289

Merged
merged 3 commits into from
Feb 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions packages/botonic-plugin-contentful/src/nlp/stemmer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ const stemmers = new SingletonMap<Stemmer>({
return new StemmerCs()
},
[locales.UKRAINIAN]: () => {
// eslint-disable-next-line @typescript-eslint/no-var-requires
const StemmerUk = require('@nlpjs/lang-uk/src/stemmer-uk')
// eslint-disable-next-line @typescript-eslint/no-var-requires,node/no-missing-require
const { StemmerUk } = require('./stemmers/stemmer-uk')
return new StemmerUk()
},
[locales.CROATIAN]: () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export class StemmerHr implements Stemmer {
private getRoot(token: string): string {
for (const rule of hrRules) {
const match = new RegExp(rule).exec(token)
if (match != undefined) {
if (match) {
const root = match[1]
if (this.containsVocal(root) && root.length > 1) {
return root
Expand Down
89 changes: 89 additions & 0 deletions packages/botonic-plugin-contentful/src/nlp/stemmers/stemmer-uk.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import Stemmer from '@nlpjs/core/src/stemmer'
elozano98 marked this conversation as resolved.
Show resolved Hide resolved
// Replaced the original lookbehind regexes because they are not supported by Safari

/**
 * Suffix-stripping stemmer for Ukrainian tokens that relies only on regex
 * features available without lookbehind assertions.
 *
 * A token is split right after its first vowel: the head is kept verbatim and
 * only the tail goes through four ordered stripping steps. The patterns list
 * lower-case Cyrillic letters only, so upper-case letters are never matched.
 */
export class StemmerUk implements Stemmer {
  /** Locates the first vowel; everything up to and including it is never altered. */
  private static readonly FIRST_VOWEL = /[аеиоуюяіїє]/

  // Step 1, first group: "в", "вши", "вшись" endings. After "и"/"ы" the vowel is
  // dropped with the ending; after "а"/"я" the vowel is written back.
  private static readonly ENDING_IV = /(?:[иы]в(?:ши(?:сь)?)?)$/
  private static readonly ENDING_AV = /(?:а(?:в(?:ши(?:сь)?)?))$/
  private static readonly ENDING_YAV = /(?:я(?:в(?:ши(?:сь)?)?))$/

  // Step 1, remaining groups, tried in this order.
  private static readonly ENDING_S = /с[яьи]$/
  private static readonly ENDINGS_FIRST = /(?:[аеєуюя]|еє|ем|єє|ий|их|іх|ів|ій|ім|їй|им|ими|іми|йми|ої|ою|ова|ове|ого|ому)$/
  private static readonly ENDINGS_AFTER_FIRST = /(?:[аіу]|ій|ий|им|ім|их|йми|ого|ому|ою)$/
  private static readonly ENDINGS_SECOND = /(?:[еєую]|ав|али|ати|вши|ив|ити|ме|сь|ся|ши|учи|яти|ячи|ать|ять)$/
  private static readonly ENDINGS_LAST = /(?:[аеєіїийоуыьюя]|ам|ах|ами|ев|еві|еи|ей|ем|ею|єм|єю|ів|їв|ий|ием|ию|ия|иям|иях|ов|ові|ой|ом|ою|ью|ья|ям|ями|ях)$/

  // Step 3. \u043e is the Cyrillic letter "о"; it is spelled as an escape so it
  // cannot be confused with the Latin look-alike "o", which would never match.
  // The guard requires consonant, vowel(s), consonant(s) and then a vowel before
  // "сть"; that vowel may be the suffix's own "о" or an earlier one.
  private static readonly OST_GUARD = /[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+(?:\u043e|[аеиоуюяіїє].*\u043e)сть$/
  private static readonly OST_SUFFIX = /\u043eсть$/

  /**
   * Stems every token independently.
   *
   * @param tokens - already tokenized words
   * @returns a new array with one stem per input token, in the same order
   */
  public stem(tokens: string[]): string[] {
    return tokens.map(token => this.stemToken(token))
  }

  /**
   * Stems one token. Tokens without a vowel, or whose first vowel is their last
   * character, are returned unchanged.
   */
  private stemToken(token: string): string {
    const vowelMatch = StemmerUk.FIRST_VOWEL.exec(token)
    if (!vowelMatch) {
      return token
    }

    const cut = vowelMatch.index + 1
    const head = token.slice(0, cut)
    const tail = token.slice(cut)
    if (tail === '') {
      return head
    }

    const stemmedTail = this.step4(this.step3(this.step2(this.step1(tail))))
    return `${head}${stemmedTail}`
  }

  /**
   * Strips one inflectional ending. The "в/вши/вшись" group wins outright; if it
   * does not apply, an optional "ся/сь/си" is removed and then the first ending
   * list that matches is applied (first list, else second, else last).
   */
  private step1(token: string): string {
    const withoutV = token
      .replace(StemmerUk.ENDING_IV, '')
      .replace(StemmerUk.ENDING_AV, 'а')
      .replace(StemmerUk.ENDING_YAV, 'я')
    if (withoutV !== token) {
      return withoutV
    }

    const base = token.replace(StemmerUk.ENDING_S, '')

    const afterFirst = base.replace(StemmerUk.ENDINGS_FIRST, '')
    if (afterFirst !== base) {
      // Only when the first list matched is one further ending removed.
      return afterFirst.replace(StemmerUk.ENDINGS_AFTER_FIRST, '')
    }

    const afterSecond = base.replace(StemmerUk.ENDINGS_SECOND, '')
    if (afterSecond !== base) {
      return afterSecond
    }

    return base.replace(StemmerUk.ENDINGS_LAST, '')
  }

  /** Drops a trailing "и". */
  private step2(token: string): string {
    return token.replace(/и$/, '')
  }

  /** Drops a trailing "ость" when the text before "сть" satisfies the guard. */
  private step3(token: string): string {
    if (StemmerUk.OST_GUARD.test(token)) {
      return token.replace(StemmerUk.OST_SUFFIX, '')
    }
    return token
  }

  /**
   * Drops a trailing soft sign; only when there is none, removes a trailing
   * "ейше" and collapses a trailing "нн" into "н".
   */
  private step4(token: string): string {
    const withoutSoftSign = token.replace(/ь$/, '')
    if (withoutSoftSign !== token) {
      return withoutSoftSign
    }
    return token.replace(/ейше$/, '').replace(/нн$/, 'н')
  }
}
7 changes: 0 additions & 7 deletions packages/botonic-plugin-contentful/src/typings.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,6 @@ declare module '@nlpjs/lang-uk/src/tokenizer-uk' {
export = TokenizerUk
}

// Ambient typing for the '@nlpjs/lang-uk/src/stemmer-uk' module: it exposes a
// single StemmerUk class, derived from BaseStemmer, as the module's export.
declare module '@nlpjs/lang-uk/src/stemmer-uk' {
import BaseStemmer from '@nlpjs/core/src/base-stemmer'

// No members are declared here; everything is inherited from BaseStemmer.
class StemmerUk extends BaseStemmer {}
export = StemmerUk
}

declare module '@nlpjs/lang-sl/src/tokenizer-sl' {
import Tokenizer from '@nlpjs/core/src/tokenizer'

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { tokenizerPerLocale } from '../../../src/nlp'
import { StemmerUk } from '../../../src/nlp/stemmers/stemmer-uk'

// Table-driven check of StemmerUk: each row is [input word, expected stems].
test.each<any>([
  ['розмовляючи', ['розмовляюч']],
  ['говорити', ['говор']],
  ['парковка', ['парковк']],
  ['експеримент', ['експеримент']],
  ['зустрічі', ['зустріч']],
  ['потурбувавши', ['потурбува']], //Solved without lookbehind regex
])(
  'TEST: Ukrainian stemmer("%s")->"%s"',
  // `expected` is the full list of stems, matching what stem() returns.
  (word: string, expected: string[]) => {
    // NOTE(review): the tokenizer is looked up with the 'hr' locale although the
    // stemmer under test is Ukrainian — confirm whether this is intentional.
    const tokenizer = tokenizerPerLocale('hr')
    const sut = new StemmerUk()
    const result = sut.stem(tokenizer.tokenize(word, false))
    expect(result).toEqual(expected)
  }
)