From e04c7ae145969020c805de6cc99c5a432d24c834 Mon Sep 17 00:00:00 2001 From: Scott Watermasysk Date: Thu, 10 Oct 2024 14:40:42 -0400 Subject: [PATCH] Handles text that does not end in a white space for SentenceTextSplitter --- lib/baran/sentence_text_splitter.rb | 2 +- test/test_sentence_text_spliter.rb | 48 +++++++++++++++++------------ 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/lib/baran/sentence_text_splitter.rb b/lib/baran/sentence_text_splitter.rb index 73eb7d0..335260f 100644 --- a/lib/baran/sentence_text_splitter.rb +++ b/lib/baran/sentence_text_splitter.rb @@ -8,7 +8,7 @@ def initialize(chunk_size: 1024, chunk_overlap: 64) def splitted(text) # Use a regex to split text based on the specified sentence-ending characters followed by whitespace - text.scan(/[^.!?]+[.!?]+(?:\s+)/).map(&:strip) + text.scan(/[^.!?]+[.!?]+(?:\s+|\z)/).map(&:strip) end end end diff --git a/test/test_sentence_text_spliter.rb b/test/test_sentence_text_spliter.rb index 1b3220b..86d5778 100644 --- a/test/test_sentence_text_spliter.rb +++ b/test/test_sentence_text_spliter.rb @@ -6,10 +6,7 @@ class TestSentenceTextSplitter < MiniTest::Unit::TestCase def setup @splitter = Baran::SentenceTextSplitter.new(chunk_size: 10, chunk_overlap: 5) - end - - def test_chunks - story = <<~TEXT + @story = <<~TEXT Hack and jill went up the hill to fetch a pail of water. Jack fell @@ -19,23 +16,36 @@ def test_chunks No, the water was splashed on Bo Peep. 
TEXT - chunks = @splitter.chunks(story) - - sentences = chunks - .map { |chunk| - chunk[:text] - .gsub(/\s+/, ' ') - .strip - } - - expected = [ - "Hack and jill went up the hill to fetch a pail of water.", - "Jack fell down and broke his crown and Jill came tumbling after.", - "The pail went flying!", - "Was the water spilled?", + @expected =[ + "Hack and jill went up the hill to fetch a pail of water.", + "Jack fell down and broke his crown and Jill came tumbling after.", + "The pail went flying!", + "Was the water spilled?", "No, the water was splashed on Bo Peep." ] + end + + def test_chunks + chunks = @splitter.chunks(@story) + sentences = format_chunks(chunks) + assert_equal(sentences, @expected) + end + + def test_chunks_without_trailing_whitespace + chunks = @splitter.chunks(@story.strip) + sentences = format_chunks(chunks) + assert_equal(sentences, @expected) + end + + + private - assert_equal(sentences, expected) + def format_chunks(chunks) + chunks + .map { |chunk| + chunk[:text] + .gsub(/\s+/, ' ') + .strip + } end end