-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgoogle.rb
105 lines (78 loc) · 2.7 KB
/
google.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
require 'net/http'
require 'open-uri'
require 'rubygems'
require 'nokogiri'
# Scrapes Google web-search result pages and collects the result links.
#
# Every method is a class-level (singleton) method. Two entry points exist:
# * Google.perform_search  - positional arguments, pages fetched one by one
#   with Net::HTTP.
# * Google.perform_search2 - options hash, pages fetched concurrently with
#   open-uri, one thread per page.
#
# The remaining public methods are the building blocks those entry points
# share. They were written below a bare `private` keyword, but `private` has
# no effect on `def Google.name` singleton methods, so they have always been
# callable from outside; they stay public here to avoid breaking callers.
class Google
  # Largest page size requested from Google (the `num` URL parameter).
  RESULTS_PER_PAGE = 100

  # CSS selector used to locate result links in a result page.
  RESULT_LINK_SELECTOR = "h3.r a".freeze

  # Runs a search and returns the result URLs, fetching pages sequentially.
  #
  # The caller's +query+ string is NOT modified; a sanitized copy is used.
  #
  # @param query [String] raw search terms
  # @param num_results [Integer] total number of results to request
  # @param domain [String] Google TLD to query, e.g. "ie" or "com"
  # @return [Array<String>] absolute http(s) result links, in page order
  def self.perform_search(query, num_results = 100, domain = "ie")
    sanitized = sanitize_query!(query.to_s.dup)

    construct_query_urls(sanitized, num_results, domain).flat_map do |url|
      response = Net::HTTP.get_response(URI.parse(url))
      extract_result_links(response.body)
    end
  end

  # Fetches every URL in +query_urls+ concurrently (one thread per URL) and
  # returns the result links found in the pages.
  #
  # Each thread returns its own list of links and the lists are concatenated
  # in the order of +query_urls+, so the output order does not depend on
  # which request finishes first. Thread#value joins the thread and re-raises
  # any exception raised inside it.
  #
  # @param query_urls [Array<String>] result-page URLs to download
  # @return [Array<String>] absolute http(s) result links
  def self.get_results(query_urls)
    workers = query_urls.map do |url|
      Thread.new(url) do |page|
        # The block form closes the underlying handle once the body is read.
        html = URI.parse(page).open { |io| io.read }
        extract_result_links(html)
      end
    end

    workers.flat_map { |worker| worker.value }
  end

  # Prepares +query+ for embedding in a search URL, modifying it IN PLACE:
  # trailing whitespace is removed, remaining whitespace characters become
  # "+", and double quotes become "%22". No other characters are escaped.
  #
  # @param query [String] mutable string to sanitize
  # @return [String] the same (now sanitized) string object
  def self.sanitize_query!(query)
    query.rstrip!
    query.gsub!(/\s/, "+")
    query.gsub!(/"/, "%22")
    # gsub! returns nil when nothing was replaced, so return the string
    # explicitly to give callers a dependable value.
    query
  end

  # Builds the list of result-page URLs needed to cover +num_results+ results:
  # one URL per full page of RESULTS_PER_PAGE results, plus one final URL for
  # any remainder.
  #
  # @param query [String] already-sanitized query string
  # @param num_results [Integer] total number of results wanted
  # @param domain [String] Google TLD; a leading "." is tolerated and dropped
  # @return [Array<String>] page URLs (empty when num_results is not positive)
  def self.construct_query_urls(query, num_results = 100, domain = "com")
    return [] if num_results <= 0

    # Avoid "www.google..com" when the TLD is given with its leading dot.
    tld = domain.to_s.sub(/\A\./, "")

    full_pages, remainder = num_results.divmod(RESULTS_PER_PAGE)
    page_sizes = Array.new(full_pages, RESULTS_PER_PAGE)
    page_sizes << remainder unless remainder.zero?

    page_sizes.each_with_index.map do |size, page|
      "http://www.google.#{tld}/search?q=#{query}&num=#{size}&start=#{page * RESULTS_PER_PAGE}&safe=off"
    end
  end

  # Runs a search described by an options hash, fetching pages concurrently.
  #
  # The string stored under :query is NOT modified; a sanitized copy is used.
  #
  # @param params [Hash] options:
  #   :query       - search terms (defaults to "")
  #   :num_results - number of results to request (defaults to 100)
  #   :domain      - Google TLD (defaults to "com")
  # @return [Array<String>] absolute http(s) result links
  def self.perform_search2(params)
    query = sanitize_query!(params[:query].to_s.dup)
    num_results = params[:num_results] || 100
    domain = params[:domain] || "com"

    get_results(construct_query_urls(query, num_results, domain))
  end

  # Parses a result page and returns the href of every result link that is an
  # absolute http or https URL.
  def self.extract_result_links(html)
    hrefs = Nokogiri::HTML(html).css(RESULT_LINK_SELECTOR).map { |link| link['href'] }
    hrefs.select { |href| href && href =~ /\Ahttps?:/ }
  end
  private_class_method :extract_result_links
end