-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.rb
executable file
·100 lines (74 loc) · 2.4 KB
/
run.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env ruby
# Author: Richard Stokes, 2011
# This script runs a custom scoring method that I designed
# for ranking email addresses in terms of
# their likely relevance to our query.
# Email addresses are ranked on their position
# in the Google search results, their degree of repetition
# within the search result body and the presence within
# the email address string of our query keywords.
require 'google'
require 'email_scraper'
require 'emailaddress'
require 'rubygems'
require 'mechanize'
require 'nokogiri'
require 'open-uri'
require 'functions'
# Scoring weights. An address's final score is the sum of:
#   * page_ranking * GOOGLE_RANKING_WEIGHT for the first page it was seen on,
#   * REPETITION_WEIGHT for every repeat sighting of the same address, and
#   * KEYWORD_PRESENCE_WEIGHT for every query keyword found in the address.
GOOGLE_RANKING_WEIGHT = 0.05
REPETITION_WEIGHT = 0.8
KEYWORD_PRESENCE_WEIGHT = 0.8

email_addresses = []

puts "What is your query?"
# gets returns nil at end-of-input and otherwise keeps the trailing newline,
# so normalise the input before it is echoed, split and sent to the search.
query = gets.to_s.strip
abort "Error: No query given" if query.empty?

# One case-insensitive pattern per keyword. Keywords are escaped so that
# regex metacharacters typed by the user (e.g. "c++" or "a.b") are matched
# literally instead of raising RegexpError or matching unintended text.
keywords = query.split
regexes = keywords.map do |keyword|
  Regexp.new(Regexp.escape(keyword), Regexp::IGNORECASE)
end

puts "Performing Google search for: #{query}"
urls = Google.perform_search(query)

urls.each_with_index do |url, index|
  # Earlier search results rank higher: the first URL gets urls.size,
  # the last one gets 1.
  page_ranking = urls.size - index
  puts "Processing: #{url}"
  puts "Ranking: #{page_ranking}"
  begin
    html = get_page_body(url)
    found = EmailScraper.get_emails(html)
    puts "Emails:"
    found.each { |address| puts address }
    puts "\n"

    found.each do |address|
      existing = email_addresses.find { |email| email.address == address }
      if existing
        # Repeat sighting: keep the entry created on the first (highest
        # ranked) page and add one repetition bonus, so the bonus grows
        # with every further occurrence instead of being capped at one.
        existing.ranking += REPETITION_WEIGHT
      else
        email_addresses << EmailAddress.new(address, page_ranking * GOOGLE_RANKING_WEIGHT)
      end
    end
  rescue StandardError => e
    # Best effort: a failure on one page must not stop the remaining pages.
    # The exception details are printed so the cause is not hidden.
    puts "Error: Couldn't retrieve page (#{e.class}: #{e.message})\n\n"
  end
end

# Adds to an email address's score for each search keyword
# that is present in the email address string
email_addresses.each do |email_address|
  regexes.each do |regex|
    email_address.ranking += KEYWORD_PRESENCE_WEIGHT if email_address.address.match(regex)
  end
end

# Order the results using EmailAddress's own comparison, then reverse them
# so the entries that compare highest come first.
email_addresses.sort!
email_addresses.reverse!

# Write each address and its ranking to file
filename = 'custom_scoring_results.txt'
File.open(filename, 'w') do |f|
  email_addresses.each do |email|
    f.write("#{email.address} : #{email.ranking}\n")
  end
end