Skip to content

Commit

Permalink
Add exclude_selector to Page model
Browse files Browse the repository at this point in the history
Part of a solution to themarshallproject#239
  • Loading branch information
kevinschaul committed Feb 23, 2019
1 parent 7d890b2 commit 69ad083
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 3 deletions.
2 changes: 1 addition & 1 deletion app/controllers/pages_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,6 @@ def set_page

# Only allow a trusted parameter "white list" through.
#
# Requires the top-level :page key on the request params and permits only
# the scalar attributes :name, :url, :css_selector and :exclude_selector,
# plus :subscriptions as an array of scalars.
#
# Returns whatever `permit` returns; the permit call is issued exactly once
# (a superseded permit list without :exclude_selector used to be evaluated
# first and thrown away).
def page_params
  params.require(:page).permit(:name, :url, :css_selector, :exclude_selector, subscriptions: [])
end
end
11 changes: 10 additions & 1 deletion app/models/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,16 @@ def document
end

# Returns the text of the nodes selected by css_selector, with the text of
# any nodes found by exclude_selector inside that selection blanked out.
#
# When exclude_selector is blank (nil or empty) the selection's text is
# returned unchanged.
#
# Note: exclusion works by setting the content of the excluded nodes to the
# empty string, i.e. it mutates the nodes handed back by `document`.
def match_text
  # Local rather than an instance variable: the node set is only needed for
  # the duration of this call and should not linger on the record.
  matched = document.css(css_selector)

  if exclude_selector.present?
    # Set the content of the exclude selector to the empty string; this also
    # discards anything nested inside the excluded nodes.
    matched.css(exclude_selector).each { |node| node.content = "" }
  end

  matched.text
end

def match_html
Expand Down
5 changes: 5 additions & 0 deletions db/migrate/20190223221320_add_exclude_selector_to_pages.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds a string column named exclude_selector to the pages table.
class AddExcludeSelectorToPages < ActiveRecord::Migration[5.2]
  # No column options are passed, so the database/adapter defaults apply.
  def change
    add_column(:pages, :exclude_selector, :string)
  end
end
3 changes: 2 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2017_05_26_193719) do
ActiveRecord::Schema.define(version: 2019_02_23_221320) do

# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
Expand Down Expand Up @@ -55,6 +55,7 @@
t.integer "user_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.string "exclude_selector"
t.index ["user_id"], name: "index_pages_on_user_id"
end

Expand Down
36 changes: 36 additions & 0 deletions spec/models/page_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,42 @@
expect(@page.match_html.length).to be > 10
end

# Text inside an element matched by a single exclude_selector is dropped
# from match_text.
it "can exclude with single selector" do
  @url = "https://www.themarshallproject.org/test-page/"
  request_headers = {
    'Accept' => '*/*',
    'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
    'Host' => 'www.themarshallproject.org',
    'User-Agent' => 'Ruby'
  }
  html = "<body><div class='keep-me'>Keep this text</div><div class='exclude-me'>Don't keep this text</div></body>"

  stub_request(:get, @url)
    .with(headers: request_headers)
    .to_return(status: 200, body: html, headers: {})

  @page = create(:page, url: @url, css_selector: "body", exclude_selector: ".exclude-me")
  expect(@page.match_text).to be == "Keep this text"
end

# A comma-separated exclude_selector drops the text of every element it
# matches.
it "can exclude with multi selector" do
  @url = "https://www.themarshallproject.org/test-page/"
  request_headers = {
    'Accept' => '*/*',
    'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
    'Host' => 'www.themarshallproject.org',
    'User-Agent' => 'Ruby'
  }
  html = "<body><div class='keep-me'>Keep this text</div><div class='exclude-me'>Don't keep this text</div><div class='also-exclude'>Don't keep this either</div></body>"

  stub_request(:get, @url)
    .with(headers: request_headers)
    .to_return(status: 200, body: html, headers: {})

  @page = create(:page, url: @url, css_selector: "body", exclude_selector: ".exclude-me,.also-exclude")
  expect(@page.match_text).to be == "Keep this text"
end

# Elements nested inside an excluded element are dropped along with it.
it "can exclude with nested content" do
  @url = "https://www.themarshallproject.org/test-page/"
  request_headers = {
    'Accept' => '*/*',
    'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
    'Host' => 'www.themarshallproject.org',
    'User-Agent' => 'Ruby'
  }
  html = "<body><div class='keep-me'>Keep this text</div><div class='exclude-me'>Don't keep this text<div>A nested div</div></div></body>"

  stub_request(:get, @url)
    .with(headers: request_headers)
    .to_return(status: 200, body: html, headers: {})

  @page = create(:page, url: @url, css_selector: "body", exclude_selector: ".exclude-me")
  expect(@page.match_text).to be == "Keep this text"
end

# An empty exclude_selector excludes nothing: all matched text is kept.
it "can work with empty exclude_selector" do
  @url = "https://www.themarshallproject.org/test-page/"
  request_headers = {
    'Accept' => '*/*',
    'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
    'Host' => 'www.themarshallproject.org',
    'User-Agent' => 'Ruby'
  }
  html = "<body><div class='keep-me'>Keep this text</div> <div class='exclude-me'>And keep this text</div></body>"

  stub_request(:get, @url)
    .with(headers: request_headers)
    .to_return(status: 200, body: html, headers: {})

  @page = create(:page, url: @url, css_selector: "body", exclude_selector: "")
  expect(@page.match_text).to be == "Keep this text And keep this text"
end

# The page hash is expected to be 64 characters long
# (presumably a hex-encoded SHA-256 digest; confirm against Page#sha2_hash).
it "can calculate the hash of a page" do
  digest_length = @page.sha2_hash.length
  expect(digest_length).to be == 64
end
Expand Down

0 comments on commit 69ad083

Please sign in to comment.