-
-
Notifications
You must be signed in to change notification settings - Fork 195
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #37 from alchaplinsky/main
🚚 Loaders: URL loader to quickly load content from web pages
- Loading branch information
Showing 6 changed files with 87 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# frozen_string_literal: true | ||
|
||
require "open-uri" | ||
|
||
module Loaders
  #
  # This Loader fetches a web page and reduces it to the text of its
  # headings and paragraphs.
  # If you'd like to use it directly you can do so like this:
  #   Loaders::HTML.new("https://nokogiri.org/").load
  #
  class HTML < Base
    # We only look for headings and paragraphs
    TEXT_CONTENT_TAGS = %w[h1 h2 h3 h4 h5 h6 p].freeze

    # Matches a string that is, in its entirety, an absolute http(s) URL.
    # Anchored with \A and \z so that text merely containing a URL is rejected,
    # and limited to the schemes whose responses carry the HTTP status that
    # #load inspects.
    URL_PATTERN = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/

    # @param url [String] address of the page to load
    def initialize(url)
      # depends_on is not defined in this class; presumably inherited from Base
      # to verify that the optional gem is available before it is required.
      depends_on "nokogiri"
      require "nokogiri"

      @url = url
    end

    # Check that url is a valid http(s) URL.
    #
    # @return [Boolean] true when the whole of url is an absolute http(s) URL
    def loadable?
      # to_s lets a nil url fall through to "no match" instead of raising.
      URL_PATTERN.match?(@url.to_s)
    end

    # Fetches the page and extracts its text content.
    #
    # @return [String, nil] inner text of every heading and paragraph, joined
    #   by blank lines; nil when the server does not answer with status 200
    def load
      return unless response.status.first == "200"

      doc = Nokogiri::HTML(response.read)
      doc.css(TEXT_CONTENT_TAGS.join(",")).map(&:inner_text).join("\n\n")
    rescue OpenURI::HTTPError
      # open-uri raises for 4xx/5xx answers rather than handing back a response
      # object, so translate that into the same nil the status check returns.
      nil
    end

    # Opens the URL once and memoizes the resulting IO-like response.
    #
    # @return [#status, #read] the object returned by open-uri
    # @raise [OpenURI::HTTPError] when the server answers with an error status
    def response
      @response ||= URI.parse(@url).open
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# frozen_string_literal: true | ||
|
||
# Specs for Loaders::HTML. No network access happens here: URI.parse is stubbed
# to hand back a double whose #open returns a canned response.
RSpec.describe Loaders::HTML do
  let(:url) { "https://www.example.com" }
  let(:status) { ["200", "OK"] }
  let(:body) { "<html><body><h1>Lorem Ipsum</h1><p>Dolor sit amet.</p></body></html>" }
  let(:response) { double("response", status: status, read: body) }

  before do
    allow(URI).to receive(:parse).and_return(double(open: response))
  end

  describe "#load" do
    subject { described_class.new(url).load }

    context "with a successful response" do
      it "returns the text of headings and paragraphs" do
        expect(subject).to eq("Lorem Ipsum\n\nDolor sit amet.")
      end
    end

    context "with an error response" do
      let(:status) { ["404", "Not Found"] }
      let(:body) { "<html><body><h1>Not Found</h1></body></html>" }

      it "returns nil" do
        expect(subject).to be_nil
      end
    end
  end

  describe "#loadable?" do
    subject { described_class.new(url).loadable? }

    context "with valid url" do
      it { is_expected.to be_truthy }
    end

    context "with invalid url" do
      let(:url) { "invalid url" }

      it { is_expected.to be_falsey }
    end
  end
end