patterns-ai-core · andreibondarev · May 21, 2023 · May 20, 2023 · May 20, 2023 · May 20, 2023
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -278,6 +278,7 @@ DEPENDENCIES
   hugging-face (~> 0.3.3)
   langchainrb!
   milvus (~> 0.9.0)
+  nokogiri (~> 1.13)
   pdf-reader (~> 1.4)
   pinecone (~> 0.1.6)
   pry-byebug (~> 3.10.0)

diff --git a/README.md b/README.md
@@ -258,6 +258,7 @@ Need to read data from various sources? Load it up.
 | docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` |
 | pdf  | Loaders::PDF  | `gem "pdf-reader", "~> 1.4"` |
 | text | Loaders::Text |                              |
+| url  | Loaders::URL  | `gem "nokogiri", "~> 1.13"`  |
 
 ## Examples
 Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)

diff --git a/langchain.gemspec b/langchain.gemspec
@@ -41,6 +41,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "google_search_results", "~> 2.0.0"
   spec.add_development_dependency "hugging-face", "~> 0.3.3"
   spec.add_development_dependency "milvus", "~> 0.9.0"
+  spec.add_development_dependency "nokogiri", "~> 1.13"
   spec.add_development_dependency "pdf-reader", "~> 1.4"
   spec.add_development_dependency "pinecone", "~> 0.1.6"
   spec.add_development_dependency "qdrant-ruby", "~> 0.9.0"

diff --git a/lib/langchain.rb b/lib/langchain.rb
@@ -58,6 +58,7 @@ module Loaders
   autoload :Docx, "loaders/docx"
   autoload :PDF, "loaders/pdf"
   autoload :Text, "loaders/text"
+  autoload :URL, "loaders/url"
 end
 
 autoload :Loader, "loader"

diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb
@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+
+require "open-uri"
+
+module Loaders
+  # Loads a web page over HTTP(S) and extracts its heading and paragraph text.
+  #
+  # If you'd like to use it directly you can do so like this:
+  #   Loaders::URL.new("https://nokogiri.org/").load
+  class URL < Base
+    # We only look for headings and paragraphs
+    TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p].freeze
+
+    # Matches a string that is, in its entirety, an absolute http(s) URL.
+    # Anchored so text that merely contains a URL is rejected, and limited to
+    # the schemes #load can handle (it relies on an HTTP status line).
+    HTTP_URL_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/
+
+    # @param url [String] address of the page to load
+    def initialize(url)
+      depends_on "nokogiri"
+      require "nokogiri"
+
+      @url = url
+    end
+
+    # Check that url is a valid URL
+    #
+    # @return [Boolean] true only when the whole string is an http(s) URL
+    def loadable?
+      HTTP_URL_PATTERN.match?(@url.to_s)
+    end
+
+    # Fetches the page and joins the text of its headings and paragraphs
+    # with blank lines.
+    #
+    # @return [String, nil] extracted text, or nil when the response status is not 200
+    def load
+      return unless response.status.first == "200"
+
+      doc = Nokogiri::HTML(response.read)
+      doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
+    rescue OpenURI::HTTPError
+      # open-uri raises for 4xx/5xx instead of returning the response, so the
+      # status check above never sees them; keep the "nil on failure" contract.
+      nil
+    end
+
+    # Opens the URL once and memoizes the resulting IO.
+    #
+    # @return [#status, #read] the open-uri response
+    def response
+      @response ||= URI.parse(@url).open
+    end
+  end
+end
diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb
@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+
+RSpec.describe Loaders::URL do
+  let(:url) { "https://www.example.com" }
+  let(:status) { ["200", "OK"] }
+  let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }
+  let(:response) { double("response", status: status, read: body) }
+
+  # Stub URI.parse so no example performs a real network request.
+  before do
+    allow(URI).to receive(:parse).and_return(double(open: response))
+  end
+
+  describe "#load" do
+    subject { described_class.new(url).load }
+
+    context "successful response" do
+      it "loads url" do
+        expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.")
+      end
+    end
+
+    context "error response" do
+      let(:status) { ["404", "Not Found"] }
+      let(:body) { "<html><body><h1>Not Found</h1></body></html>" }
+
+      it "returns nil" do
+        expect(subject).to be_nil
+      end
+    end
+  end
+
+  describe "#loadable?" do
+    subject { described_class.new(url).loadable? }
+
+    context "with valid url" do
+      it { is_expected.to be_truthy }
+    end
+
+    context "with invalid url" do
+      let(:url) { "invalid url" }
+
+      it { is_expected.to be_falsey }
+    end
+  end
+end