fix: handling errors in splitBySentenceTokenizer (run-llama#1087)
Co-authored-by: Alex Yang <himself65@outlook.com>
marcusschiesser and himself65 authored Jul 30, 2024
1 parent da5cfc4 commit 0452af9
Showing 3 changed files with 23 additions and 3 deletions.
.changeset/thin-pens-deliver.md (6 additions & 0 deletions)
@@ -0,0 +1,6 @@
+---
+"@llamaindex/core": patch
+"@llamaindex/core-tests": patch
+---
+
+fix: handling errors in splitBySentenceTokenizer
packages/core/src/node-parser/utils.ts (5 additions & 1 deletion)
@@ -39,7 +39,11 @@ export const splitBySentenceTokenizer = (): TextSplitterFn => {
   }
   const tokenizer = sentenceTokenizer;
   return (text: string) => {
-    return tokenizer.tokenize(text);
+    try {
+      return tokenizer.tokenize(text);
+    } catch {
+      return [text];
+    }
   };
 };
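For illustration only (not part of this commit), here is a minimal usage sketch of the patched splitter, using the splitBySentenceTokenizer export exercised by the test below; the sample input is taken from that test, and the failure mode described in the comments is inferred from issue run-llama#1087 and the new try/catch:

import { splitBySentenceTokenizer } from "@llamaindex/core/node-parser";

// splitBySentenceTokenizer() returns a (text: string) => string[] function.
const split = splitBySentenceTokenizer();

// Input with a parenthesized "(i.e., ...)" clause, which per issue run-llama#1087
// could make the underlying sentence tokenizer throw instead of returning sentences.
const tricky =
  "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";

// With this fix the error is caught and the whole input is returned as a
// single chunk rather than the call throwing.
console.log(split(tricky)); // [tricky]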

packages/core/tests/node-parser/text-splitter.test.ts (12 additions & 2 deletions)
@@ -1,7 +1,10 @@
-import { SentenceSplitter } from "@llamaindex/core/node-parser";
+import {
+  SentenceSplitter,
+  splitBySentenceTokenizer,
+} from "@llamaindex/core/node-parser";
 import { describe, expect, test } from "vitest";
 
-describe("SentenceSplitter", () => {
+describe("sentence splitter", () => {
   test("initializes", () => {
     const sentenceSplitter = new SentenceSplitter();
     expect(sentenceSplitter).toBeDefined();
@@ -105,4 +108,11 @@ describe("SentenceSplitter", () => {
       "因为他照了人类,连我都在内。",
     ]);
   });
+
+  test("issue 1087 - edge case when input with brackets", () => {
+    const text =
+      "A card must be of uniform thickness and made of unfolded and uncreased paper or cardstock of approximately the quality and weight of a stamped card (i.e., a card available from USPS).";
+    const split = splitBySentenceTokenizer();
+    expect(split(text)).toEqual([text]);
+  });
 });
