Skip to content

Commit

Permalink
Merge pull request #37 from alchaplinsky/main
Browse files Browse the repository at this point in the history
🚚 Loaders: URL loader to quickly load content from web pages
  • Loading branch information
andreibondarev authored May 21, 2023
2 parents 33021f7 + bf38c20 commit 1c2165c
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 0 deletions.
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ DEPENDENCIES
hugging-face (~> 0.3.4)
langchainrb!
milvus (~> 0.9.0)
nokogiri (~> 1.13)
pdf-reader (~> 1.4)
pinecone (~> 0.1.6)
pry-byebug (~> 3.10.0)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ Need to read data from various sources? Load it up.
| docx | Loaders::Docx | `gem "docx", branch: "master", git: "https://github.com/ruby-docx/docx.git"` |
| pdf | Loaders::PDF | `gem "pdf-reader", "~> 1.4"` |
| text | Loaders::Text | |
| html | Loaders::HTML | `gem "nokogiri", "~> 1.13"` |

## Examples
Additional examples available: [/examples](https://github.com/andreibondarev/langchainrb/tree/main/examples)
Expand Down
1 change: 1 addition & 0 deletions langchain.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Gem::Specification.new do |spec|
spec.add_development_dependency "google_search_results", "~> 2.0.0"
spec.add_development_dependency "hugging-face", "~> 0.3.4"
spec.add_development_dependency "milvus", "~> 0.9.0"
spec.add_development_dependency "nokogiri", "~> 1.13"
spec.add_development_dependency "pdf-reader", "~> 1.4"
spec.add_development_dependency "pinecone", "~> 0.1.6"
spec.add_development_dependency "replicate-ruby"
Expand Down
1 change: 1 addition & 0 deletions lib/langchain.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ module Loaders
autoload :Docx, "loaders/docx"
autoload :PDF, "loaders/pdf"
autoload :Text, "loaders/text"
autoload :HTML, "loaders/html"
end

autoload :Loader, "loader"
Expand Down
38 changes: 38 additions & 0 deletions lib/loaders/html.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# frozen_string_literal: true

require "open-uri"

module Loaders
  # Fetches the page at a URL and extracts the text of its headings and
  # paragraphs.
  class HTML < Base
    # We only look for headings and paragraphs
    TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p].freeze

    # Anchored with \A and \z so the WHOLE string has to be a URI; a string
    # that merely contains one somewhere (e.g. "see http://x.com") is rejected.
    URL_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp}\z/

    #
    # This Loader parses URL into a text.
    # If you'd like to use it directly you can do so like this:
    # Loaders::HTML.new("https://nokogiri.org/").load
    #
    # @param url [String] address of the page to load
    def initialize(url)
      # depends_on is inherited from Base (not visible here); presumably it
      # verifies that the optional gem is available — confirm against Base.
      depends_on "nokogiri"
      require "nokogiri"

      @url = url
    end

    # Check that url is a valid URL
    #
    # @return [Boolean] true only when the url is a String that is, in its
    #   entirety, a URI; false for anything else (including nil)
    def loadable?
      # The String guard keeps this predicate from raising on non-string input.
      @url.is_a?(String) && URL_PATTERN.match?(@url)
    end

    # Downloads the page and collects the text of every element listed in
    # TEXT_CONTENT_TAGS.
    #
    # @return [String, nil] the inner text of the matched elements separated
    #   by blank lines, or nil when the response status code is not "200"
    def load
      return unless response.status.first == "200"

      doc = Nokogiri::HTML(response.read)
      doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
    end

    # Opens the url (URI#open comes from open-uri) and memoizes the result,
    # so the request is issued at most once per loader instance.
    #
    # @return [Object] whatever URI#open returns; #load reads its `status`
    #   and `read`
    def response
      @response ||= URI.parse(@url).open
    end
  end
end
45 changes: 45 additions & 0 deletions spec/loaders/html_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# frozen_string_literal: true

# Specs for Loaders::HTML. No network access happens here: URI.parse is
# stubbed to return a double whose #open yields a canned response.
RSpec.describe Loaders::HTML do
  let(:url) { "https://www.example.com" }
  let(:status) { ["200", "OK"] }
  let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }
  let(:response) { double("response", status: status, read: body) }

  before do
    allow(URI).to receive(:parse).and_return(double(open: response))
  end

  describe "#load" do
    subject { described_class.new(url).load }

    context "successful response" do
      it "loads url" do
        # Heading and paragraph text are joined by a blank line.
        expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.")
      end
    end

    context "error response" do
      let(:status) { ["404", "Not Found"] }
      let(:body) { "<html><body><h1>Not Found</h1></body></html>" }

      # The body is ignored entirely when the status code is not "200".
      it "returns nil" do
        expect(subject).to be_nil
      end
    end
  end

  describe "#loadable?" do
    subject { described_class.new(url).loadable? }

    context "with valid url" do
      it { is_expected.to be_truthy }
    end

    context "with invalid url" do
      let(:url) { "invalid url" }

      it { is_expected.to be_falsey }
    end
  end
end

0 comments on commit 1c2165c

Please sign in to comment.