Skip to content

Commit

Permalink
chore(lyra): export tokenizer function
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva committed Aug 26, 2022
1 parent 99eb188 commit 3e6df9a
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { trackInsertion } from "./insertion-checker";
type Index = Record<string, Node>;

export { formatNanoseconds } from "./utils";
export { tokenize } from "./tokenizer";

export type PropertyType = "string" | "number" | "boolean";

Expand Down
10 changes: 8 additions & 2 deletions src/tokenizer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,21 @@ const splitRegex: Record<Language, RegExp> = {
swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
};

export function tokenize(input: string, language: Language = "english") {
export function tokenize(input: string, language: Language = "english", allowDuplicates = false) {
/* c8 ignore next 3 */
if (typeof input !== "string") {
return [input];
}

const splitRule = splitRegex[language];
const tokens = input.toLowerCase().split(splitRule).map(replaceDiacritics);
return Array.from(new Set(trim(tokens)));
const trimTokens = trim(tokens);

if (!allowDuplicates) {
return Array.from(new Set(trimTokens));
}

return trimTokens;
}

function trim(text: string[]): string[] {
Expand Down
18 changes: 18 additions & 0 deletions tap-snapshots/tests/tokenizer.test.ts.test.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,24 @@ Array [
]
`

exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O1 1`] = `
Array [
"this",
"is",
"a",
"test",
"with",
"duplicates",
]
`

exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in english and allow duplicates > Should tokenize and stem correctly in english and allow duplicates-O2 1`] = `
Array [
"it's",
"alive",
]
`

exports[`tests/tokenizer.test.ts TAP Tokenizer Should tokenize and stem correctly in french > Should tokenize and stem correctly in french-O1 1`] = `
Array [
"voyons",
Expand Down
15 changes: 14 additions & 1 deletion tests/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import t from "tap";
import { tokenize } from "../src/tokenizer";

t.test("Tokenizer", t => {
t.plan(9);
t.plan(10);

t.test("Should tokenize and stem correctly in english", t => {
t.plan(2);
Expand All @@ -17,6 +17,19 @@ t.test("Tokenizer", t => {
t.matchSnapshot(O2, `${t.name}-O2`);
});

t.test("Should tokenize and stem correctly in english and allow duplicates", t => {
  t.plan(2);

  // Both inputs contain repeated words so that de-duplication would be visible.
  const I1 = "this is a test with test duplicates";
  const I2 = "it's alive! it's alive!";

  // The third argument (allowDuplicates) must be `true`: it defaults to `false`,
  // in which case tokenize() collapses repeated tokens through a Set and this
  // test would only re-check the de-duplicating path.
  const O1 = tokenize(I1, "english", true);
  const O2 = tokenize(I2, "english", true);

  // Assert the exact token lists instead of matching a snapshot: the snapshot
  // recorded for this test holds the de-duplicated output, so it cannot verify
  // that repeated tokens are preserved in their original order.
  t.strictSame(O1, ["this", "is", "a", "test", "with", "test", "duplicates"], `${t.name}-O1`);
  t.strictSame(O2, ["it's", "alive", "it's", "alive"], `${t.name}-O2`);
});

t.test("Should tokenize and stem correctly in french", t => {
t.plan(2);

Expand Down

0 comments on commit 3e6df9a

Please sign in to comment.