diff --git a/Gemfile.lock b/Gemfile.lock
index 80e785c50..fb5895439 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -2,6 +2,7 @@ PATH
   remote: .
   specs:
     langchainrb (0.5.5)
+      baran (~> 0.1.6)
       colorize (~> 0.8.1)
       tiktoken_ruby (~> 0.0.5)
 
@@ -32,6 +33,7 @@ GEM
     afm (0.2.2)
     ai21 (0.2.0)
     ast (2.4.2)
+    baran (0.1.6)
     builder (3.2.4)
     byebug (11.1.3)
     childprocess (4.1.0)
diff --git a/langchain.gemspec b/langchain.gemspec
index 197cd7d68..b171e0283 100644
--- a/langchain.gemspec
+++ b/langchain.gemspec
@@ -32,8 +32,9 @@ Gem::Specification.new do |spec|
 
   # dependencies
-  # Not sure if we should require this as it only applies to OpenAI usecase.
-  spec.add_dependency "tiktoken_ruby", "~> 0.0.5"
+  spec.add_dependency "baran", "~> 0.1.6"
   spec.add_dependency "colorize", "~> 0.8.1"
+  # Not sure if we should require this as it only applies to OpenAI usecase.
+  spec.add_dependency "tiktoken_ruby", "~> 0.0.5"
 
   # development dependencies
   spec.add_development_dependency "dotenv-rails", "~> 2.7.6"
diff --git a/lib/langchain.rb b/lib/langchain.rb
index 65416ea90..29da5920f 100644
--- a/lib/langchain.rb
+++ b/lib/langchain.rb
@@ -78,6 +78,11 @@ module Agent
     autoload :SQLQueryAgent, "langchain/agent/sql_query_agent/sql_query_agent.rb"
   end
 
+  module Chunker
+    autoload :Base, "langchain/chunker/base"
+    autoload :Text, "langchain/chunker/text"
+  end
+
   module Tool
     autoload :Base, "langchain/tool/base"
     autoload :Calculator, "langchain/tool/calculator"
diff --git a/lib/langchain/chunker/base.rb b/lib/langchain/chunker/base.rb
new file mode 100644
index 000000000..a6c56d00c
--- /dev/null
+++ b/lib/langchain/chunker/base.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module Langchain
+  module Chunker
+    # = Chunkers
+    # Chunkers are used to split documents into smaller chunks before indexing into vector search databases.
+    # Otherwise large documents, when retrieved and passed to LLMs, may hit the context window limits.
+    #
+    # == Available chunkers
+    #
+    # - {Langchain::Chunker::Text}
+    class Base
+    end
+  end
+end
diff --git a/lib/langchain/chunker/text.rb b/lib/langchain/chunker/text.rb
new file mode 100644
index 000000000..3493a25ab
--- /dev/null
+++ b/lib/langchain/chunker/text.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+require "baran"
+
+module Langchain
+  module Chunker
+    #
+    # Simple text chunker that delegates the splitting to Baran::CharacterTextSplitter.
+    #
+    # Usage:
+    #     Langchain::Chunker::Text.new(text).chunks
+    #
+    class Text < Base
+      attr_reader :text, :chunk_size, :chunk_overlap, :separator
+
+      # @param [String] text the text to split
+      # @param [Integer] chunk_size passed to Baran::CharacterTextSplitter as chunk_size
+      # @param [Integer] chunk_overlap passed to Baran::CharacterTextSplitter as chunk_overlap
+      # @param [String] separator passed to Baran::CharacterTextSplitter as separator
+      def initialize(text, chunk_size: 1000, chunk_overlap: 200, separator: "\n\n")
+        @text = text
+        @chunk_size = chunk_size
+        @chunk_overlap = chunk_overlap
+        @separator = separator
+      end
+
+      # Builds a Baran::CharacterTextSplitter from the configured options and splits #text with it.
+      #
+      # @return [Array] whatever Baran::CharacterTextSplitter#chunks returns for #text
+      def chunks
+        splitter = Baran::CharacterTextSplitter.new(
+          chunk_size: chunk_size,
+          chunk_overlap: chunk_overlap,
+          separator: separator
+        )
+        splitter.chunks(text)
+      end
+    end
+  end
+end
diff --git a/lib/langchain/data.rb b/lib/langchain/data.rb
index e3b22aa6e..434fad7f0 100644
--- a/lib/langchain/data.rb
+++ b/lib/langchain/data.rb
@@ -12,5 +12,13 @@ def initialize(data, options = {})
     def value
       @data
     end
+
+    # Splits the wrapped data into chunks using Langchain::Chunker::Text.
+    #
+    # @param [Hash] opts options forwarded as keyword arguments to Langchain::Chunker::Text.new
+    # @return [Array] the result of Langchain::Chunker::Text#chunks
+    def chunks(opts = {})
+      Langchain::Chunker::Text.new(@data, **opts).chunks
+    end
   end
 end
diff --git a/spec/langchain/chunker/base_spec.rb b/spec/langchain/chunker/base_spec.rb
new file mode 100644
index 000000000..65c4237bc
--- /dev/null
+++ b/spec/langchain/chunker/base_spec.rb
@@ -0,0 +1,4 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::Chunker::Base do
+end
diff --git a/spec/langchain/chunker/text_spec.rb b/spec/langchain/chunker/text_spec.rb
new file mode 100644
index 000000000..b5874223c
--- /dev/null
+++ b/spec/langchain/chunker/text_spec.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::Chunker::Text do
+  let(:source) { "spec/fixtures/loaders/example.txt" }
+  let(:text) { File.read(source) }
+
+  subject {
+    described_class.new(text,
+      chunk_size: 1000,
+      chunk_overlap: 200,
+      separator: "\n\n")
+  }
+
+  describe "#chunks" do
+    it "returns an array of chunks" do
+      expect(Baran::CharacterTextSplitter).to receive(:new).with(
+        chunk_size: 1000,
+        chunk_overlap: 200,
+        separator: "\n\n"
+      ).and_call_original
+
+      expect_any_instance_of(Baran::CharacterTextSplitter).to receive(:chunks)
+        .with(text)
+        .and_call_original
+
+      chunks = subject.chunks
+
+      expect(chunks).to be_an(Array)
+      expect(chunks).to all(have_key(:text))
+    end
+  end
+end
diff --git a/spec/langchain/data_spec.rb b/spec/langchain/data_spec.rb
new file mode 100644
index 000000000..0cb635f37
--- /dev/null
+++ b/spec/langchain/data_spec.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::Data do
+  let(:source) { "spec/fixtures/loaders/example.txt" }
+  let(:data) { File.read(source) }
+
+  subject { described_class.new(data, source: source) }
+
+  describe "#chunks" do
+    it "returns an array of chunks" do
+      chunks = subject.chunks
+      expect(chunks[0]).to have_key(:text)
+      expect(chunks[1]).to have_key(:text)
+      expect(chunks[2]).to have_key(:text)
+    end
+  end
+end