From 142c5601653cbf7cf048ce9c286be2d72b232c63 Mon Sep 17 00:00:00 2001 From: Alex Chaplinsky Date: Sat, 20 May 2023 21:35:23 +0200 Subject: [PATCH 1/5] Add url loader to quickly load content from web pages --- Gemfile.lock | 1 + langchain.gemspec | 1 + lib/langchain.rb | 1 + lib/loaders/url.rb | 38 +++++++++++++++++++++++++++++++++ spec/loaders/url_spec.rb | 45 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+) create mode 100644 lib/loaders/url.rb create mode 100644 spec/loaders/url_spec.rb diff --git a/Gemfile.lock b/Gemfile.lock index bd4d7d0cb..05bc860bc 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -278,6 +278,7 @@ DEPENDENCIES hugging-face (~> 0.3.3) langchainrb! milvus (~> 0.9.0) + nokogiri (~> 1.13) pdf-reader (~> 1.4) pinecone (~> 0.1.6) pry-byebug (~> 3.10.0) diff --git a/langchain.gemspec b/langchain.gemspec index 30273f884..03639b2ef 100644 --- a/langchain.gemspec +++ b/langchain.gemspec @@ -41,6 +41,7 @@ Gem::Specification.new do |spec| spec.add_development_dependency "google_search_results", "~> 2.0.0" spec.add_development_dependency "hugging-face", "~> 0.3.3" spec.add_development_dependency "milvus", "~> 0.9.0" + spec.add_development_dependency "nokogiri", "~> 1.13" spec.add_development_dependency "pdf-reader", "~> 1.4" spec.add_development_dependency "pinecone", "~> 0.1.6" spec.add_development_dependency "qdrant-ruby", "~> 0.9.0" diff --git a/lib/langchain.rb b/lib/langchain.rb index 7db2e0129..2aba47f54 100644 --- a/lib/langchain.rb +++ b/lib/langchain.rb @@ -58,6 +58,7 @@ module Loaders autoload :Docx, "loaders/docx" autoload :PDF, "loaders/pdf" autoload :Text, "loaders/text" + autoload :URL, "loaders/url" end autoload :Loader, "loader" diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb new file mode 100644 index 000000000..90207dff8 --- /dev/null +++ b/lib/loaders/url.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +require 'open-uri' + +module Loaders + class URL < Base + # We only look for headings and paragraphs + TEXT_CONTENT_TAGS = %w(h1 h2 h3 h4 h5 h6 p) + + # + # This Loader parses URL into a text. + # If you'd like to use it directly you can do so like this: + # Loaders::URL.new("https://nokogiri.org/").load + # + def initialize(url) + depends_on "nokogiri" + require "nokogiri" + + @url = url + end + + # Check that url is a valid URL + def loadable? + !!(@url =~ URI::regexp) + end + + def load + return unless response.status.first == "200" + + doc = Nokogiri::HTML(response.read) + doc.css(TEXT_CONTENT_TAGS.join(',')).map(&:inner_text).join("\n\n") + end + + def response + @response ||= URI.open(@url) + end + end +end diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb new file mode 100644 index 000000000..6b2235445 --- /dev/null +++ b/spec/loaders/url_spec.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +RSpec.describe Loaders::URL do + let(:url) { "https://www.example.com" } + let(:status) { ['200', 'OK'] } + let(:body) { "

Lorem Ipsum

Dolor sit amet.

" } + let(:response) { double("response", status: status, read: body) } + + before do + allow(URI).to receive(:open).and_return(response) + end + + describe "#load" do + subject { described_class.new(url).load } + + context 'successful response' do + it "loads url" do + expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.") + end + end + + context 'error response' do + let(:status) { ['404', 'Not Found'] } + let(:body) { "

Not Found

" } + + it "loads url" do + expect(subject).to eq(nil) + end + end + end + + describe "#loadable?" do + subject { described_class.new(url).loadable? } + + context 'with valid url' do + it { is_expected.to be_truthy } + end + + context 'with invalid url' do + let(:url) { "invalid url" } + + it { is_expected.to be_falsey } + end + end +end From 58cb70daed839506fcf9d79dfd6b03e6f81a8a93 Mon Sep 17 00:00:00 2001 From: Alex Chaplinsky Date: Sat, 20 May 2023 21:46:27 +0200 Subject: [PATCH 2/5] Stylistic changes --- lib/loaders/url.rb | 6 +++--- spec/loaders/url_spec.rb | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb index 90207dff8..cb9007ddc 100644 --- a/lib/loaders/url.rb +++ b/lib/loaders/url.rb @@ -1,11 +1,11 @@ # frozen_string_literal: true -require 'open-uri' +require "open-uri" module Loaders class URL < Base # We only look for headings and paragraphs - TEXT_CONTENT_TAGS = %w(h1 h2 h3 h4 h5 h6 p) + TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p] # # This Loader parses URL into a text. @@ -21,7 +21,7 @@ def initialize(url) # Check that url is a valid URL def loadable? - !!(@url =~ URI::regexp) + !!(@url =~ URI::DEFAULT_PARSER.make_regexp) end def load diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb index 6b2235445..600b25889 100644 --- a/spec/loaders/url_spec.rb +++ b/spec/loaders/url_spec.rb @@ -2,7 +2,7 @@ RSpec.describe Loaders::URL do let(:url) { "https://www.example.com" } - let(:status) { ['200', 'OK'] } + let(:status) { ["200", "OK"] } let(:body) { "

Lorem Ipsum

Dolor sit amet.

" } let(:response) { double("response", status: status, read: body) } @@ -13,14 +13,14 @@ describe "#load" do subject { described_class.new(url).load } - context 'successful response' do + context "successful response" do it "loads url" do expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.") end end - context 'error response' do - let(:status) { ['404', 'Not Found'] } + context "error response" do + let(:status) { ["404", "Not Found"] } let(:body) { "

Not Found

" } it "loads url" do @@ -32,11 +32,11 @@ describe "#loadable?" do subject { described_class.new(url).loadable? } - context 'with valid url' do + context "with valid url" do it { is_expected.to be_truthy } end - context 'with invalid url' do + context "with invalid url" do let(:url) { "invalid url" } it { is_expected.to be_falsey } From 31d5190be8b2a0039a233ed7f3b850985db15ee9 Mon Sep 17 00:00:00 2001 From: Alex Chaplinsky Date: Sat, 20 May 2023 21:52:18 +0200 Subject: [PATCH 3/5] Mitigate security risk with URI.open --- lib/loaders/url.rb | 4 ++-- spec/loaders/url_spec.rb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb index cb9007ddc..3b52768ff 100644 --- a/lib/loaders/url.rb +++ b/lib/loaders/url.rb @@ -28,11 +28,11 @@ def load return unless response.status.first == "200" doc = Nokogiri::HTML(response.read) - doc.css(TEXT_CONTENT_TAGS.join(',')).map(&:inner_text).join("\n\n") + doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n") end def response - @response ||= URI.open(@url) + @response ||= URI.parse(@url).open end end end diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb index 600b25889..40cd1be95 100644 --- a/spec/loaders/url_spec.rb +++ b/spec/loaders/url_spec.rb @@ -7,7 +7,7 @@ let(:response) { double("response", status: status, read: body) } before do - allow(URI).to receive(:open).and_return(response) + allow(URI).to receive(:parse).and_return(double(open: response)) end describe "#load" do From a79ff62bb31547eb8defc350c7509c584014676e Mon Sep 17 00:00:00 2001 From: Alex Chaplinsky Date: Sun, 21 May 2023 11:37:45 +0200 Subject: [PATCH 4/5] Add URL loader to README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 756fa827e..c2f39d62b 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,7 @@ Need to read data from various sources? Load it up. | docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` | | pdf | Loaders::PDF | `gem "pdf-reader", "~> 1.4"` | | text | Loaders::Text | | +| url | Loaders::URL | `gem "nokogiri", "~> 1.13"` | ## Examples Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples) From bf38c20408b7b93795c96a6ea82fdea08bd76e2b Mon Sep 17 00:00:00 2001 From: Alex Chaplinsky Date: Sun, 21 May 2023 17:53:48 +0200 Subject: [PATCH 5/5] Rename URL loader to HTML --- README.md | 2 +- lib/langchain.rb | 2 +- lib/loaders/{url.rb => html.rb} | 2 +- spec/loaders/{url_spec.rb => html_spec.rb} | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename lib/loaders/{url.rb => html.rb} (97%) rename spec/loaders/{url_spec.rb => html_spec.rb} (97%) diff --git a/README.md b/README.md index c2f39d62b..78fff00d0 100644 --- a/README.md +++ b/README.md @@ -258,7 +258,7 @@ Need to read data from various sources? Load it up. | docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` | | pdf | Loaders::PDF | `gem "pdf-reader", "~> 1.4"` | | text | Loaders::Text | | -| url | Loaders::URL | `gem "nokogiri", "~> 1.13"` | +| html | Loaders::HTML | `gem "nokogiri", "~> 1.13"` | ## Examples Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples) diff --git a/lib/langchain.rb b/lib/langchain.rb index 2aba47f54..fe818f28d 100644 --- a/lib/langchain.rb +++ b/lib/langchain.rb @@ -58,7 +58,7 @@ module Loaders autoload :Docx, "loaders/docx" autoload :PDF, "loaders/pdf" autoload :Text, "loaders/text" - autoload :URL, "loaders/url" + autoload :HTML, "loaders/html" end autoload :Loader, "loader" diff --git a/lib/loaders/url.rb b/lib/loaders/html.rb similarity index 97% rename from lib/loaders/url.rb rename to lib/loaders/html.rb index 3b52768ff..55d70e9de 100644 --- a/lib/loaders/url.rb +++ b/lib/loaders/html.rb @@ -3,7 +3,7 @@ require "open-uri" module Loaders - class URL < Base + class HTML < Base # We only look for headings and paragraphs TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p] diff --git a/spec/loaders/url_spec.rb b/spec/loaders/html_spec.rb similarity index 97% rename from spec/loaders/url_spec.rb rename to spec/loaders/html_spec.rb index 40cd1be95..2f8e95f8d 100644 --- a/spec/loaders/url_spec.rb +++ b/spec/loaders/html_spec.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -RSpec.describe Loaders::URL do +RSpec.describe Loaders::HTML do let(:url) { "https://www.example.com" } let(:status) { ["200", "OK"] } let(:body) { "

Lorem Ipsum

Dolor sit amet.

" }