-
-
Notifications
You must be signed in to change notification settings - Fork 2.4k
/
Copy pathurl_scraper.rb
210 lines (171 loc) · 5.06 KB
/
url_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
module Docs
class UrlScraper < Scraper
class << self
attr_accessor :params
attr_accessor :headers
attr_accessor :force_gzip
def inherited(subclass)
super
subclass.params = params.deep_dup
subclass.headers = headers.deep_dup
subclass.force_gzip = force_gzip
end
end
@@rate_limiter = nil
self.params = {}
self.headers = { 'User-Agent' => 'DevDocs' }
self.force_gzip = false
private
def request_one(url)
Request.run url, request_options
end
def request_all(urls, &block)
if options[:rate_limit]
if @@rate_limiter
@@rate_limiter.limit = options[:rate_limit]
else
@@rate_limiter = RateLimiter.new(options[:rate_limit])
Typhoeus.before(&@@rate_limiter.to_proc)
end
end
Requester.run urls, request_options: request_options, &block
end
def request_options
options = { params: self.class.params, headers: self.class.headers }
options[:accept_encoding] = 'gzip' if self.class.force_gzip
options
end
def process_response?(response)
if response.error?
raise <<~ERROR
Error status code (#{response.code}): #{response.return_message}
#{response.url}
#{JSON.pretty_generate(response.headers).slice(2..-3)}
ERROR
elsif response.blank?
raise "Empty response body: #{response.url}"
end
response.success? && response.html? && process_url?(response.effective_url)
end
def process_url?(url)
base_url.contains?(url)
end
def load_capybara_selenium
require 'capybara/dsl'
require 'selenium/webdriver'
Capybara.register_driver :chrome do |app|
options = Selenium::WebDriver::Chrome::Options.new(args: %w[headless disable-gpu])
Capybara::Selenium::Driver.new(app, browser: :chrome, options: options)
end
Capybara.javascript_driver = :chrome
Capybara.current_driver = :chrome
Capybara.run_server = false
Capybara
end
module MultipleBaseUrls
def self.included(base)
base.extend ClassMethods
end
module ClassMethods
attr_reader :base_urls
def base_urls=(urls)
self.base_url = urls.first
@base_urls = urls
end
end
def initial_urls
super + self.class.base_urls[1..-1].deep_dup
end
def base_urls
@base_urls ||= self.class.base_urls.map { |url| URL.parse(url) }
end
private
def process_url?(url)
base_urls.any? { |base_url| base_url.contains?(url) }
end
def process_response(response)
original_scheme = self.base_url.scheme
original_host = self.base_url.host
original_path = self.base_url.path
effective_base_url = self.base_urls.find { |base_url| base_url.contains?(response.effective_url) }
self.base_url.scheme = effective_base_url.scheme
self.base_url.host = effective_base_url.host
self.base_url.path = effective_base_url.path
super
ensure
self.base_url.scheme = original_scheme
self.base_url.host = original_host
self.base_url.path = original_path
end
end
module FixRedirectionsBehavior
def self.included(base)
base.extend ClassMethods
end
def self.prepended(base)
class << base
prepend ClassMethods
end
end
module ClassMethods
def redirections
@redirections
end
def store_pages(store)
instrument 'info.doc', msg: 'Fetching redirections...'
with_redirections do
instrument 'info.doc', msg: 'Continuing...'
super
end
end
private
def with_redirections
@redirections = new.fetch_redirections
yield
ensure
@redirections = nil
end
end
def fetch_redirections
result = {}
with_filters 'apply_base_url', 'container', 'normalize_urls', 'internal_urls' do
build_pages do |page|
next if page[:response_effective_path] == page[:response_path]
result[page[:response_path].downcase] = page[:response_effective_path]
end
end
result
end
private
def process_response(response)
super.merge! response_effective_path: response.effective_path, response_path: response.path
end
def additional_options
super.merge! redirections: self.class.redirections
end
end
class RateLimiter
attr_accessor :limit
def initialize(limit)
@limit = limit
@minute = nil
@counter = 0
end
def call(*)
if @minute != Time.now.min
@minute = Time.now.min
@counter = 0
end
@counter += 1
if @counter >= @limit
wait = Time.now.end_of_minute.to_i - Time.now.to_i + 1
sleep wait
end
true
end
def to_proc
method(:call).to_proc
end
end
end
end