-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
871c0d7
commit 8c63b8a
Showing
5 changed files
with
142 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,76 +1,9 @@ | ||
require 'jaro_winkler/fallback' | ||
require 'jaro_winkler/adjusting_table' | ||
require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback? | ||
module JaroWinkler | ||
module_function | ||
# Legacy pure-Ruby Jaro similarity between two strings.
#
# @param s1 [String] first string
# @param s2 [String] second string
# @param options [Hash] only +:adj_table+ is read; when truthy, characters
#   listed as similar in DEFAULT_ADJ_TABLE earn partial credit
# @return [Float] similarity score; 0.0 when no character matches
def jaro_distance(s1, s2, options = {})
  length1 = s1.length
  length2 = s2.length
  # Guarantee the length order: s1 is always the shorter string.
  if length1 > length2
    s1, s2 = s2, s1
    length1, length2 = length2, length1
  end
  window_size = [(length2 / 2) - 1, 0].max
  matches = 0.0
  sim_matches = 0.0
  transpositions = 0
  previous_index = -1
  max_index = length2 - 1
  s1.each_char.with_index do |c1, i|
    left = [i - window_size, 0].max
    right = [i + window_size, max_index].min
    matched = false
    sim_matched = false
    found = false
    s2[left..right].each_char.with_index do |c2, j|
      if c1 == c2
        matched = true
        s2_index = left + j
        # A match is "in order" only when it lies past the previous in-order match.
        if !found && s2_index > previous_index
          previous_index = s2_index
          found = true
        end
      elsif options[:adj_table] && DEFAULT_ADJ_TABLE[c1][c2]
        sim_matched = true
      end
    end
    if matched
      matches += 1
      transpositions += 1 unless found
    elsif sim_matched # not matched but similarly matched
      sim_matches += 3
    end
  end
  # Always return a Float, including on the no-match path.
  return 0.0 if matches.zero?

  # Don't divide transpositions by 2 since it's been counted directly by above code.
  similarity = matches
  similarity += sim_matches / 10 if options[:adj_table]
  (similarity / length1 + similarity / length2 + (matches - transpositions) / matches) / 3.0
end
require 'jaro_winkler/version' | ||
|
||
# Pure-Ruby Jaro-Winkler similarity: the Jaro score plus a bonus for a shared
# prefix of up to four characters.
#
# @param s1 [String] first string
# @param s2 [String] second string
# @param options [Hash] +:weight+ (prefix scaling factor, default 0.1),
#   +:threshold+ (minimum Jaro score that earns the bonus, default 0.7),
#   +:ignore_case+ (default false), +:adj_table+ (default false)
# @return [Numeric] similarity score
# @raise [RuntimeError] when the weight exceeds 0.25
def r_distance(s1, s2, options = {})
  options = { weight: 0.1, threshold: 0.7, ignore_case: false, adj_table: false }.merge(options)
  weight = options[:weight]
  threshold = options[:threshold]
  raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1' if weight > 0.25

  if options[:ignore_case]
    s1 = s1.upcase
    s2 = s2.upcase
  end

  distance = jaro_distance(s1, s2, options)
  return distance if distance < threshold

  # Length of the common prefix, capped at four characters.
  limit = [4, s1.length, s2.length].min
  prefix = 0
  prefix += 1 while prefix < limit && s1[prefix] == s2[prefix]
  distance + ((prefix * weight) * (1 - distance))
end
|
||
# Choose the implementation behind the public `distance` entry point.
# In fallback mode the pure-Ruby `r_distance` is exposed as both `distance`
# and `c_distance`; otherwise `distance` is an alias of `c_distance`.
# NOTE(review): `c_distance` is presumably defined by the compiled extension
# required at the top of the file - confirm.
if JaroWinkler.fallback? | ||
alias :distance :r_distance | ||
alias :c_distance :r_distance | ||
# The aliases are created after the bare `module_function`, so they are
# promoted to module functions explicitly here.
module_function :distance, :c_distance | ||
else | ||
alias :distance :c_distance | ||
module_function :distance | ||
end | ||
end | ||
# Load the pure-Ruby implementation when RUBY_PLATFORM is 'java';
# every other platform loads the extension build.
if RUBY_PLATFORM == 'java'
  require 'jaro_winkler/jaro_winkler_pure'
else
  require 'jaro_winkler/jaro_winkler_ext'
end
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
require 'jaro_winkler/adjusting_table' | ||
# Pure-Ruby implementation of the Jaro and Jaro-Winkler string similarity
# measures. All public functions return a Float where larger means more similar.
module JaroWinkler
  # Base class for every error raised by this module.
  class Error < RuntimeError; end
  # Raised when the Winkler scaling factor (+:weight+) exceeds 0.25.
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  # Frozen so the shared defaults cannot be altered by accident; callers'
  # options are merged into a fresh Hash on every call.
  DEFAULT_OPTIONS = {
    jaro: { adj_table: false, ignore_case: false }.freeze,
    jaro_winkler: { weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD }.freeze
  }.freeze

  module_function

  # Jaro-Winkler similarity of two strings.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @param options [Hash] +:weight+, +:threshold+, +:ignore_case+, +:adj_table+
  # @return [Float]
  # @raise [InvalidWeightError] when +:weight+ is greater than 0.25
  def distance(str1, str2, options = {})
    _distance(str1.codepoints, str2.codepoints, options)
  end

  # Jaro similarity of two strings.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @param options [Hash] +:ignore_case+, +:adj_table+
  # @return [Float]
  def jaro_distance(str1, str2, options = {})
    _jaro_distance(str1.codepoints, str2.codepoints, options)
  end

  # Jaro-Winkler similarity of two code point arrays. The arrays are not modified.
  #
  # @param codes1 [Array<Integer>]
  # @param codes2 [Array<Integer>]
  # @param options [Hash] see {distance}
  # @return [Float]
  # @raise [InvalidWeightError] when +:weight+ is greater than 0.25
  def _distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro_winkler].merge(options)
    if options[:weight] > 0.25
      raise InvalidWeightError,
            'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1'
    end

    # Fold case here as well, so the prefix comparison below sees the same
    # normalized data as the Jaro computation.
    if options[:ignore_case]
      codes1 = _fold_case(codes1)
      codes2 = _fold_case(codes2)
    end

    jaro = _jaro_distance(codes1, codes2, options)
    return jaro if jaro < options[:threshold]

    # Winkler bonus: reward a common prefix of at most four code points.
    max_prefix = [4, codes1.length, codes2.length].min
    prefix = (0...max_prefix).take_while { |i| codes1[i] == codes2[i] }.size
    jaro + prefix * options[:weight] * (1 - jaro)
  end

  # Jaro similarity of two code point arrays. The arrays are not modified.
  #
  # @param codes1 [Array<Integer>]
  # @param codes2 [Array<Integer>]
  # @param options [Hash] +:ignore_case+ folds ASCII a-z to A-Z before
  #   comparing; +:adj_table+ gives partial credit for unmatched characters
  #   that DEFAULT_ADJ_TABLE lists as similar
  # @return [Float] 0.0 when either input is empty or nothing matches
  def _jaro_distance(codes1, codes2, options = {})
    options = DEFAULT_OPTIONS[:jaro].merge(options)

    # Keep codes1 as the shorter sequence.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1 = codes1.length
    len2 = codes2.length
    return 0.0 if len1.zero? || len2.zero?

    if options[:ignore_case]
      codes1 = _fold_case(codes1)
      codes2 = _fold_case(codes2)
    end

    window = [len2 / 2 - 1, 0].max
    matched1 = Array.new(len1, false)
    matched2 = Array.new(len2, false)

    # Count matching characters: each code in codes1 claims the first
    # still-unclaimed equal code inside its window of codes2.
    codes1.each_with_index do |code, i|
      left = [i - window, 0].max
      right = [i + window, len2 - 1].min
      j = (left..right).find { |pos| !matched2[pos] && codes2[pos] == code }
      next unless j

      matched1[i] = matched2[j] = true
    end

    match_count = matched1.count(true)
    return 0.0 if match_count.zero?

    # Count transpositions: walk the matched codes of both sides in order and
    # count the positions where they disagree.
    common1 = codes1.select.with_index { |_, i| matched1[i] }
    common2 = codes2.select.with_index { |_, j| matched2[j] }
    transposition_count = common1.zip(common2).count { |a, b| a != b }

    # Count similarities among the unmatched characters.
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      codes1.each_with_index do |code1, i|
        next if matched1[i]

        char1 = code1.chr(Encoding::UTF_8)
        similar = codes2.each_with_index.any? do |code2, j|
          !matched2[j] && DEFAULT_ADJ_TABLE[char1][code2.chr(Encoding::UTF_8)]
        end
        similar_count += 3 if similar
      end
    end

    m = match_count.to_f
    t = transposition_count / 2 # integer division, as in the original formula
    m += similar_count / 10.0 if options[:adj_table]
    (m / len1 + m / len2 + (m - t) / m) / 3
  end

  # Returns a new array with ASCII lowercase code points (a-z) mapped to
  # uppercase; every other code point is passed through unchanged.
  def _fold_case(codes)
    codes.map { |c| c.between?(97, 122) ? c - 32 : c }
  end
  private_class_method :_fold_case
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# Gem version constant. The two assignments below are the removed ("1.3.7")
# and added ('1.4.0') sides of a one-line diff hunk, not a double assignment.
module JaroWinkler | ||
VERSION = "1.3.7" | ||
VERSION = '1.4.0' | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters