-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.rb
69 lines (58 loc) · 1.61 KB
/
crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
require 'typhoeus'
# Crawls outward from a starting URL by following a fixed chain of link
# patterns and collects the numeric ids found in steamcommunity profile URLs:
#
#   *.gameme.com/tf  ->  *.gameme.com/overview/<digits>
#                    ->  *.gameme.com/playerinfo/<digits>
#                    ->  steamcommunity.com/profiles/<digits>
#
# Pages are fetched through a Typhoeus::Hydra created with max_concurrency 8.
# Within one #go! run every URL is processed at most once, and an id is never
# added to #steam_ids twice.
class Crawler
  attr_accessor :clan_url, :steam_ids, :errors

  # @param clan_url [String] URL the crawl starts from
  def initialize(clan_url)
    @clan_url = clan_url
    @hydra = Typhoeus::Hydra.new(max_concurrency: 8)
    @steam_ids = []
    @errors = []
    @visited = {}
  end

  # Seeds the crawl with #clan_url and runs the hydra. Results accumulate in
  # #steam_ids; problems accumulate in #errors.
  #
  # @return [Object] whatever Typhoeus::Hydra#run returns
  def go!
    # Start each run with a clean slate so calling go! again still re-fetches.
    @visited.clear
    crawler [@clan_url]
    @hydra.run
  end

  private

  # Dispatches each URL according to the pattern it matches: profile URLs
  # yield an id, crawlable URLs are queued for fetching, anything else is
  # recorded as an error.
  #
  # @param urls [Array<String>] URLs to process
  def crawler(urls)
    urls.each do |url|
      # The same link commonly appears on several pages; handle it only once.
      next if @visited[url]

      @visited[url] = true
      regex, steam_id = regex_determiner(url)
      if steam_id
        record_steam_id(get_steam_id_from_url(url, regex))
      elsif regex
        crawl url, regex
      else
        @errors << "Bad Url: #{url}"
      end
    end
  end

  # Appends an id to #steam_ids unless it is missing or already present.
  #
  # @param id [String, nil]
  def record_steam_id(id)
    @steam_ids << id if id && !@steam_ids.include?(id)
  end

  # Queues a request for +url+. When it completes successfully, the body is
  # scanned with +regex+ and the matches are fed back into #crawler; otherwise
  # the response options are recorded in #errors.
  #
  # @param url [String] page to fetch
  # @param regex [Regexp] pattern of the links to follow from that page
  def crawl(url, regex)
    request = Typhoeus::Request.new(url)
    # Attach the callback before queueing so the request is fully configured
    # by the time the hydra owns it.
    request.on_complete do |response|
      if response.success?
        crawler scrape_page_source(response.body, regex)
      else
        @errors << response.options.to_s
      end
    end
    @hydra.queue(request)
  end

  # Extracts the first captured id from a profile URL.
  #
  # @param url [String] a URL matching the profile pattern
  # @param regex [Regexp] pattern whose capture group is the id
  # @return [String, nil] the first capture, or nil when nothing matched
  def get_steam_id_from_url(url, regex)
    scrape_page_source(url, regex).first
  end

  # Maps a URL to the pattern that should be applied next.
  #
  # @param url [String]
  # @return [Array(Regexp, Boolean), nil] the pattern plus a flag that is true
  #   when the pattern extracts an id from the URL itself (rather than links
  #   from the fetched page); nil when the URL matches no known pattern
  def regex_determiner(url)
    case url
    when /(http\:\/\/\w+\.gameme\.com\/tf)/
      return /(http\:\/\/\w+\.gameme\.com\/overview\/\d+)/, false
    when /(http\:\/\/\w+\.gameme\.com\/overview\/\d+)/
      return /(http\:\/\/\w+\.gameme\.com\/playerinfo\/\d+)/, false
    when /(http\:\/\/\w+\.gameme\.com\/playerinfo\/\d+)/
      return /http\:\/\/steamcommunity\.com\/profiles\/\d+/, false
    when /http\:\/\/steamcommunity\.com\/profiles\/\d+/
      return /http\:\/\/steamcommunity\.com\/profiles\/(\d+)/, true
    else
      # Bad Url - callers destructure this into two nils.
      nil
    end
  end

  # Returns the distinct matches of +regex+ in +page_source+ as a flat array
  # (capture groups, when present, are flattened into the result).
  #
  # @param page_source [String] text to scan
  # @param regex [Regexp]
  # @return [Array<String>]
  def scrape_page_source(page_source, regex)
    page_source.scan(regex).compact.uniq.flatten
  end
end