From 8bb0f67892bfdcef1897e864281b0bec16e7d233 Mon Sep 17 00:00:00 2001 From: Moeki Kawakami Date: Tue, 26 Sep 2023 16:31:04 +0900 Subject: [PATCH] Add chunks with metadata --- lib/baran/character_text_splitter.rb | 2 +- lib/baran/text_splitter.rb | 6 +++++- test/test_character_text_splitter.rb | 2 +- test/test_recursive_character_text_splitter.rb | 2 +- test/test_text_splitter.rb | 9 +++++++++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lib/baran/character_text_splitter.rb b/lib/baran/character_text_splitter.rb index 7de8729..cf1c7b9 100644 --- a/lib/baran/character_text_splitter.rb +++ b/lib/baran/character_text_splitter.rb @@ -14,4 +14,4 @@ def splitted(text) merged(splits, @separator) end end -end \ No newline at end of file +end diff --git a/lib/baran/text_splitter.rb b/lib/baran/text_splitter.rb index 1f27861..a481d4f 100644 --- a/lib/baran/text_splitter.rb +++ b/lib/baran/text_splitter.rb @@ -26,6 +26,10 @@ def chunks(text) chunks end + def chunks_with_metadata(text:, metadata:) + { chunks: chunks(text), metadata: metadata } + end + def joined(items, separator) text = items.join(separator).strip text.empty? ? nil : text @@ -56,4 +60,4 @@ def merged(splits, separator) results end end -end \ No newline at end of file +end diff --git a/test/test_character_text_splitter.rb b/test/test_character_text_splitter.rb index d6537fc..aa3e6bd 100644 --- a/test/test_character_text_splitter.rb +++ b/test/test_character_text_splitter.rb @@ -13,4 +13,4 @@ def test_chunks assert_equal(chunks.length, 3) end -end \ No newline at end of file +end diff --git a/test/test_recursive_character_text_splitter.rb b/test/test_recursive_character_text_splitter.rb index d411c38..67b002e 100644 --- a/test/test_recursive_character_text_splitter.rb +++ b/test/test_recursive_character_text_splitter.rb @@ -20,4 +20,4 @@ def test_empty_chunks assert_equal(chunks.length, 6) end -end \ No newline at end of file +end diff --git a/test/test_text_splitter.rb b/test/test_text_splitter.rb index 88b3a70..d788ab1 100644 --- a/test/test_text_splitter.rb +++ b/test/test_text_splitter.rb @@ -55,6 +55,15 @@ def test_chunks assert_equal 'text', documents[0][:text] end + def test_chunks_with_metadata + text = 'text one' + metadata = { page: 1 } + documents = @test_splitter.chunks_with_metadata(text: text, metadata: metadata) + + assert_equal 2, documents[:chunks].size + assert_equal({ page: 1 }, documents[:metadata]) + end + def test_joined items = ['one', 'two', 'three'] separator = ' '