-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathedit_distance_metric.rb
executable file
·86 lines (63 loc) · 2.13 KB
/
edit_distance_metric.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env ruby
# Author: Richard Stokes, 2011
# Script that tests email search using an edit-distance metric
# and a probable email address constructed from keywords,
# e.g. richardstokes@ucd.ie would be a probable email address for
# the keywords "Richard Stokes UCD"
# In this scoring metric, the lower the score, the
# likelier an email address is to be accurate
require 'google'
require 'emailaddress'
require 'email_scraper'
require 'editdistance'
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'functions'
require 'pp'
# Interactive script body:
#   1. asks for a person and related keywords on standard input,
#   2. builds a "likely" email address from them via create_email,
#   3. runs a Google search and scrapes each result page for email addresses,
#   4. ranks every address found by its Damerau-Levenshtein distance from the
#      likely address (lower is better),
#   5. writes the sorted results to a text file and opens it in gedit.

# Creates and gathers the relevant containers and input data
email_addresses = []
puts "Who are you searching for?"
person = gets.rstrip
puts "Enter a related keyword(s) to aid the search:"
keywords = gets.rstrip
likely_email = create_email(person, keywords, "ie")
puts "Likely email: #{likely_email}"

# Performs the google search
puts "\nPerforming search:"
urls = Google.perform_search("#{person} #{keywords}", 50)
urls.each do |url|
  puts url
  # Skip .pdf and .doc files - unable to effectively parse text contents.
  # The dot is escaped so only a literal ".pdf"/".doc" suffix matches; an
  # unescaped "." would match any character (e.g. a URL ending in "mydoc").
  next if url.match(/\.(pdf|doc)\z/)

  begin
    html = get_page_body(url)
    found = EmailScraper.get_emails(html)
    if found.empty?
      puts "No emails found.\n\n"
    else
      puts "Emails:"
      pp found
      puts "\n"
    end

    found.each do |address|
      # Keep at most one entry per address. Matching on the address string
      # (rather than Array#delete, which compares whole objects with ==)
      # avoids removing unrelated entries should EmailAddress define equality
      # in terms of its ranking - NOTE(review): confirm against EmailAddress.
      email_addresses.reject! { |email| email.address == address }
      ranking = EditDistance.dameraulevenshtein(likely_email, address)
      email_addresses << EmailAddress.new(address, ranking)
    end
  rescue
    # Bare rescue catches StandardError only; any failure while fetching or
    # scraping this page is reported and the loop moves on to the next URL.
    puts "Error: Couldn't retrieve page.\n\n"
  end
end

# Sort the email addresses according to their Damerau-Levenshtein distance
# from the probable email address constructed at the start
# (relies on the ordering EmailAddress itself defines)
email_addresses.sort!

# Write each address and its ranking to file
filename = 'edit_distance_results.txt'
File.open(filename, 'w') do |f|
  email_addresses.each do |email|
    f.write("#{email.address} : #{email.ranking}\n")
  end
end

# Open the results in gedit. Passing the arguments separately bypasses the
# shell and reuses `filename` instead of repeating the literal path.
system('gedit', filename)