From 142c5601653cbf7cf048ce9c286be2d72b232c63 Mon Sep 17 00:00:00 2001
From: Alex Chaplinsky <alchaplinsky@gmail.com>
Date: Sat, 20 May 2023 21:35:23 +0200
Subject: [PATCH 1/5] Add URL loader to quickly load content from web pages

---
 Gemfile.lock             |  1 +
 langchain.gemspec        |  1 +
 lib/langchain.rb         |  1 +
 lib/loaders/url.rb       | 38 +++++++++++++++++++++++++++++++++
 spec/loaders/url_spec.rb | 45 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 86 insertions(+)
 create mode 100644 lib/loaders/url.rb
 create mode 100644 spec/loaders/url_spec.rb
diff --git a/Gemfile.lock b/Gemfile.lock
index bd4d7d0cb..05bc860bc 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -278,6 +278,7 @@ DEPENDENCIES
   hugging-face (~> 0.3.3)
   langchainrb!
   milvus (~> 0.9.0)
+  nokogiri (~> 1.13)
   pdf-reader (~> 1.4)
   pinecone (~> 0.1.6)
   pry-byebug (~> 3.10.0)
diff --git a/langchain.gemspec b/langchain.gemspec
index 30273f884..03639b2ef 100644
--- a/langchain.gemspec
+++ b/langchain.gemspec
@@ -41,6 +41,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "google_search_results", "~> 2.0.0"
   spec.add_development_dependency "hugging-face", "~> 0.3.3"
   spec.add_development_dependency "milvus", "~> 0.9.0"
+  spec.add_development_dependency "nokogiri", "~> 1.13"
   spec.add_development_dependency "pdf-reader", "~> 1.4"
   spec.add_development_dependency "pinecone", "~> 0.1.6"
   spec.add_development_dependency "qdrant-ruby", "~> 0.9.0"
diff --git a/lib/langchain.rb b/lib/langchain.rb
index 7db2e0129..2aba47f54 100644
--- a/lib/langchain.rb
+++ b/lib/langchain.rb
@@ -58,6 +58,7 @@ module Loaders
   autoload :Docx, "loaders/docx"
   autoload :PDF, "loaders/pdf"
   autoload :Text, "loaders/text"
+  autoload :URL, "loaders/url"
 end
 
 autoload :Loader, "loader"
diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb
new file mode 100644
index 000000000..90207dff8
--- /dev/null
+++ b/lib/loaders/url.rb
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+require 'open-uri'
+
+module Loaders
+  class URL < Base
+    # We only look for headings and paragraphs
+    TEXT_CONTENT_TAGS = %w(h1 h2 h3 h4 h5 h6 p)
+
+    #
+    # This Loader parses URL into a text.
+    # If you'd like to use it directly you can do so like this:
+    # Loaders::URL.new("https://nokogiri.org/").load
+    #
+    def initialize(url)
+      depends_on "nokogiri"
+      require "nokogiri"
+
+      @url = url
+    end
+
+    # Check that url is a valid URL
+    def loadable?
+      !!(@url =~ URI::regexp)
+    end
+
+    def load
+      return unless response.status.first == "200"
+
+      doc = Nokogiri::HTML(response.read)
+      doc.css(TEXT_CONTENT_TAGS.join(',')).map(&:inner_text).join("\n\n")
+    end
+
+    def response
+      @response ||= URI.open(@url)
+    end
+  end
+end
diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb
new file mode 100644
index 000000000..6b2235445
--- /dev/null
+++ b/spec/loaders/url_spec.rb
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+RSpec.describe Loaders::URL do
+  let(:url) { "https://www.example.com" }
+  let(:status) { ['200', 'OK'] }
+  let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }
+  let(:response) { double("response", status: status, read: body) }
+
+  before do
+    allow(URI).to receive(:open).and_return(response)
+  end
+
+  describe "#load" do
+    subject { described_class.new(url).load }
+
+    context 'successful response' do
+      it "loads url" do
+        expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.")
+      end
+    end
+
+    context 'error response' do
+      let(:status) { ['404', 'Not Found'] }
+      let(:body) { "<html><body><h1>Not Found</h1></body></html>" }
+
+      it "loads url" do
+        expect(subject).to eq(nil)
+      end
+    end
+  end
+
+  describe "#loadable?" do
+    subject { described_class.new(url).loadable? }
+
+    context 'with valid url' do
+      it { is_expected.to be_truthy }
+    end
+
+    context 'with invalid url' do
+      let(:url) { "invalid url" }
+
+      it { is_expected.to be_falsey }
+    end
+  end
+end

From 58cb70daed839506fcf9d79dfd6b03e6f81a8a93 Mon Sep 17 00:00:00 2001
From: Alex Chaplinsky <alchaplinsky@gmail.com>
Date: Sat, 20 May 2023 21:46:27 +0200
Subject: [PATCH 2/5] Stylistic changes; replace obsolete URI::regexp

---
 lib/loaders/url.rb       |  6 +++---
 spec/loaders/url_spec.rb | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb
index 90207dff8..cb9007ddc 100644
--- a/lib/loaders/url.rb
+++ b/lib/loaders/url.rb
@@ -1,11 +1,11 @@
 # frozen_string_literal: true
 
-require 'open-uri'
+require "open-uri"
 
 module Loaders
   class URL < Base
     # We only look for headings and paragraphs
-    TEXT_CONTENT_TAGS = %w(h1 h2 h3 h4 h5 h6 p)
+    TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
 
     #
     # This Loader parses URL into a text.
@@ -21,7 +21,7 @@ def initialize(url)
 
     # Check that url is a valid URL
     def loadable?
-      !!(@url =~ URI::regexp)
+      !!(@url =~ URI::DEFAULT_PARSER.make_regexp)
     end
 
     def load
diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb
index 6b2235445..600b25889 100644
--- a/spec/loaders/url_spec.rb
+++ b/spec/loaders/url_spec.rb
@@ -2,7 +2,7 @@
 
 RSpec.describe Loaders::URL do
   let(:url) { "https://www.example.com" }
-  let(:status) { ['200', 'OK'] }
+  let(:status) { ["200", "OK"] }
   let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }
   let(:response) { double("response", status: status, read: body) }
 
@@ -13,14 +13,14 @@
   describe "#load" do
     subject { described_class.new(url).load }
 
-    context 'successful response' do
+    context "successful response" do
       it "loads url" do
         expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.")
       end
     end
 
-    context 'error response' do
-      let(:status) { ['404', 'Not Found'] }
+    context "error response" do
+      let(:status) { ["404", "Not Found"] }
       let(:body) { "<html><body><h1>Not Found</h1></body></html>" }
 
       it "loads url" do
@@ -32,11 +32,11 @@
   describe "#loadable?" do
     subject { described_class.new(url).loadable? }
 
-    context 'with valid url' do
+    context "with valid url" do
       it { is_expected.to be_truthy }
     end
 
-    context 'with invalid url' do
+    context "with invalid url" do
       let(:url) { "invalid url" }
 
       it { is_expected.to be_falsey }

From 31d5190be8b2a0039a233ed7f3b850985db15ee9 Mon Sep 17 00:00:00 2001
From: Alex Chaplinsky <alchaplinsky@gmail.com>
Date: Sat, 20 May 2023 21:52:18 +0200
Subject: [PATCH 3/5] Mitigate URI.open security risk by using URI.parse(url).open

---
 lib/loaders/url.rb       | 4 ++--
 spec/loaders/url_spec.rb | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/loaders/url.rb b/lib/loaders/url.rb
index cb9007ddc..3b52768ff 100644
--- a/lib/loaders/url.rb
+++ b/lib/loaders/url.rb
@@ -28,11 +28,11 @@ def load
       return unless response.status.first == "200"
 
       doc = Nokogiri::HTML(response.read)
-      doc.css(TEXT_CONTENT_TAGS.join(',')).map(&:inner_text).join("\n\n")
+      doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
     end
 
     def response
-      @response ||= URI.open(@url)
+      @response ||= URI.parse(@url).open
     end
   end
 end
diff --git a/spec/loaders/url_spec.rb b/spec/loaders/url_spec.rb
index 600b25889..40cd1be95 100644
--- a/spec/loaders/url_spec.rb
+++ b/spec/loaders/url_spec.rb
@@ -7,7 +7,7 @@
   let(:response) { double("response", status: status, read: body) }
 
   before do
-    allow(URI).to receive(:open).and_return(response)
+    allow(URI).to receive(:parse).and_return(double(open: response))
   end
 
   describe "#load" do

From a79ff62bb31547eb8defc350c7509c584014676e Mon Sep 17 00:00:00 2001
From: Alex Chaplinsky <alchaplinsky@gmail.com>
Date: Sun, 21 May 2023 11:37:45 +0200
Subject: [PATCH 4/5] Add URL loader to README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 756fa827e..c2f39d62b 100644
--- a/README.md
+++ b/README.md
@@ -258,6 +258,7 @@ Need to read data from various sources? Load it up.
 | docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` |
 | pdf  | Loaders::PDF  | `gem "pdf-reader", "~> 1.4"` |
 | text | Loaders::Text |                              |
+| url  | Loaders::URL  | `gem "nokogiri", "~> 1.13"`  |
 
 ## Examples
 Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)

From bf38c20408b7b93795c96a6ea82fdea08bd76e2b Mon Sep 17 00:00:00 2001
From: Alex Chaplinsky <alchaplinsky@gmail.com>
Date: Sun, 21 May 2023 17:53:48 +0200
Subject: [PATCH 5/5] Rename URL loader to HTML

---
 README.md                                  | 2 +-
 lib/langchain.rb                           | 2 +-
 lib/loaders/{url.rb => html.rb}            | 2 +-
 spec/loaders/{url_spec.rb => html_spec.rb} | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)
 rename lib/loaders/{url.rb => html.rb} (97%)
 rename spec/loaders/{url_spec.rb => html_spec.rb} (97%)

diff --git a/README.md b/README.md
index c2f39d62b..78fff00d0 100644
--- a/README.md
+++ b/README.md
@@ -258,7 +258,7 @@ Need to read data from various sources? Load it up.
 | docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` |
 | pdf  | Loaders::PDF  | `gem "pdf-reader", "~> 1.4"` |
 | text | Loaders::Text |                              |
-| url  | Loaders::URL  | `gem "nokogiri", "~> 1.13"`  |
+| html | Loaders::HTML | `gem "nokogiri", "~> 1.13"`  |
 
 ## Examples
 Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
diff --git a/lib/langchain.rb b/lib/langchain.rb
index 2aba47f54..fe818f28d 100644
--- a/lib/langchain.rb
+++ b/lib/langchain.rb
@@ -58,7 +58,7 @@ module Loaders
   autoload :Docx, "loaders/docx"
   autoload :PDF, "loaders/pdf"
   autoload :Text, "loaders/text"
-  autoload :URL, "loaders/url"
+  autoload :HTML, "loaders/html"
 end
 
 autoload :Loader, "loader"
diff --git a/lib/loaders/url.rb b/lib/loaders/html.rb
similarity index 97%
rename from lib/loaders/url.rb
rename to lib/loaders/html.rb
index 3b52768ff..55d70e9de 100644
--- a/lib/loaders/url.rb
+++ b/lib/loaders/html.rb
@@ -3,7 +3,7 @@
 require "open-uri"
 
 module Loaders
-  class URL < Base
+  class HTML < Base
     # We only look for headings and paragraphs
     TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p]
 
diff --git a/spec/loaders/url_spec.rb b/spec/loaders/html_spec.rb
similarity index 97%
rename from spec/loaders/url_spec.rb
rename to spec/loaders/html_spec.rb
index 40cd1be95..2f8e95f8d 100644
--- a/spec/loaders/url_spec.rb
+++ b/spec/loaders/html_spec.rb
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-RSpec.describe Loaders::URL do
+RSpec.describe Loaders::HTML do
   let(:url) { "https://www.example.com" }
   let(:status) { ["200", "OK"] }
   let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }