Skip to content

Commit

Permalink
reimplement pure ruby version
Browse files Browse the repository at this point in the history
  • Loading branch information
tonytonyjan committed Dec 12, 2015
1 parent 871c0d7 commit 8c63b8a
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 84 deletions.
83 changes: 8 additions & 75 deletions lib/jaro_winkler.rb
Original file line number Diff line number Diff line change
@@ -1,76 +1,9 @@
require 'jaro_winkler/fallback'
require 'jaro_winkler/adjusting_table'
require 'jaro_winkler/jaro_winkler.so' unless JaroWinkler.fallback?
module JaroWinkler
module_function
# Computes a Jaro-style similarity score between two strings.
#
# The shorter string is scanned character by character; each character is
# looked up inside a sliding window of the longer string. A character that is
# found only at a position not beyond the last in-order match is counted as a
# transposition.
#
# s1, s2  - the strings to compare (order does not matter).
# options - Hash; when options[:adj_table] is truthy, characters that are
#           unequal but listed in DEFAULT_ADJ_TABLE contribute a partial score.
#
# Returns the Integer 0 when no character matches, otherwise a Float.
def jaro_distance s1, s2, options = {}
  use_adj_table = options[:adj_table]

  # Keep s1 as the shorter of the two strings.
  s1, s2 = s2, s1 if s1.length > s2.length
  short_len = s1.length
  long_len = s2.length

  window = [(long_len / 2) - 1, 0].max
  last = long_len - 1

  exact = 0.0
  similar = 0.0
  out_of_order = 0
  cursor = -1 # index in s2 of the most recent in-order match

  s1.each_char.with_index do |ch, pos|
    lo = [pos - window, 0].max
    hi = [pos + window, last].min
    hit = false
    near = false
    in_order = false

    (lo..hi).each do |idx|
      other = s2[idx]
      if ch == other
        hit = true
        # Only the first match located after the cursor advances it.
        if !in_order && idx > cursor
          cursor = idx
          in_order = true
        end
      elsif use_adj_table && DEFAULT_ADJ_TABLE[ch][other]
        near = true
      end
    end

    if hit
      exact += 1
      out_of_order += 1 unless in_order
    elsif near
      # Similar but not equal: weighted 3, scaled down by 10 below.
      similar += 3
    end
  end

  return 0 if exact == 0

  # Transpositions are counted directly above, so they are not halved here.
  score = use_adj_table ? exact + similar / 10 : exact
  (score / short_len + score / long_len + (exact - out_of_order) / exact) / 3.0
end
require 'jaro_winkler/version'

# Pure-Ruby Jaro-Winkler similarity built on top of jaro_distance.
#
# s1, s2  - the strings to compare.
# options - Hash merged over these defaults:
#           :weight      (0.1)   scaling factor applied per common prefix character
#           :threshold   (0.7)   Jaro scores below this are returned unchanged
#           :ignore_case (false) upcase both strings before comparing
#           :adj_table   (false) forwarded to jaro_distance
#
# Raises RuntimeError when :weight is greater than 0.25.
# Returns the Jaro score, boosted by the common prefix (at most 4 characters)
# when the score is not below the threshold.
def r_distance s1, s2, options = {}
  defaults = {weight: 0.1, threshold: 0.7, ignore_case: false, adj_table: false}
  settings = defaults.merge options
  scaling = settings[:weight]
  if scaling > 0.25
    raise 'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1'
  end

  if settings[:ignore_case]
    s1 = s1.upcase
    s2 = s2.upcase
  end

  jaro = jaro_distance(s1, s2, settings)
  return jaro if jaro < settings[:threshold]

  # Length of the shared prefix, capped at 4 characters.
  limit = [4, s1.length, s2.length].min
  common = 0
  common += 1 while common < limit && s1[common] == s2[common]

  jaro + ((common * scaling) * (1 - jaro))
end

# Choose which implementation is exposed as `distance`, based on
# JaroWinkler.fallback?.
if JaroWinkler.fallback?
# Fallback mode: both `distance` and `c_distance` point at the Ruby r_distance.
alias :distance :r_distance
alias :c_distance :r_distance
module_function :distance, :c_distance
else
# NOTE(review): c_distance is presumably provided by the jaro_winkler.so
# extension required at the top of the file — confirm.
alias :distance :c_distance
module_function :distance
end
end
# Load the implementation that matches the running platform.
case RUBY_PLATFORM
when 'java'
# RUBY_PLATFORM is exactly 'java': use the pure-Ruby implementation.
require 'jaro_winkler/jaro_winkler_pure'
else
# Any other platform loads jaro_winkler_ext
# (presumably the compiled extension — confirm).
require 'jaro_winkler/jaro_winkler_ext'
end

6 changes: 0 additions & 6 deletions lib/jaro_winkler/fallback.rb

This file was deleted.

125 changes: 125 additions & 0 deletions lib/jaro_winkler/jaro_winkler_pure.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
require 'jaro_winkler/adjusting_table'
# Pure-Ruby implementation of the Jaro and Jaro-Winkler string similarity
# measures. The public entry points take strings; the underscore-prefixed
# variants take arrays of integer codepoints.
module JaroWinkler
  class Error < RuntimeError; end
  # Raised when the Winkler scaling factor (:weight) is greater than 0.25.
  class InvalidWeightError < Error; end

  DEFAULT_WEIGHT = 0.1
  DEFAULT_THRESHOLD = 0.7
  DEFAULT_OPTIONS = {
    jaro: {adj_table: false, ignore_case: false},
    jaro_winkler: {weight: DEFAULT_WEIGHT, threshold: DEFAULT_THRESHOLD}
  }

  module_function

  # Jaro-Winkler similarity of two strings.
  #
  # str1, str2 - the strings to compare.
  # options    - Hash; see _distance and _jaro_distance for the recognised keys.
  #
  # Returns a Float.
  # Raises InvalidWeightError when options[:weight] is greater than 0.25.
  def distance str1, str2, options={}
    _distance str1.codepoints, str2.codepoints, options
  end

  # Jaro similarity of two strings.
  #
  # str1, str2 - the strings to compare.
  # options    - Hash; see _jaro_distance for the recognised keys.
  #
  # Returns a Float.
  def jaro_distance str1, str2, options={}
    _jaro_distance str1.codepoints, str2.codepoints, options
  end

  # Jaro-Winkler similarity of two codepoint arrays.
  #
  # codes1, codes2 - Arrays of Integer codepoints; they are not modified.
  # options        - Hash merged over DEFAULT_OPTIONS[:jaro_winkler]:
  #                  :weight    scaling factor per common prefix codepoint
  #                  :threshold Jaro scores below this are returned unchanged
  #                  Remaining keys are forwarded to _jaro_distance.
  #
  # Returns a Float.
  # Raises InvalidWeightError when :weight is greater than 0.25.
  def _distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro_winkler].merge options
    if options[:weight] > 0.25
      raise InvalidWeightError,
            'Scaling factor should not exceed 0.25, otherwise the distance can become larger than 1'
    end

    # Normalise case on copies so the prefix comparison below is also
    # case-insensitive, without touching the caller's arrays.
    if options[:ignore_case]
      codes1 = _upcase_ascii(codes1)
      codes2 = _upcase_ascii(codes2)
    end

    jaro = _jaro_distance(codes1, codes2, options)
    return jaro if jaro < options[:threshold]

    # Length of the common prefix, capped at 4 codepoints.
    max_prefix = [4, codes1.length, codes2.length].min
    prefix = 0
    prefix += 1 while prefix < max_prefix && codes1[prefix] == codes2[prefix]

    jaro + prefix * options[:weight] * (1 - jaro)
  end

  # Jaro similarity of two codepoint arrays.
  #
  # codes1, codes2 - Arrays of Integer codepoints; they are not modified.
  # options        - Hash merged over DEFAULT_OPTIONS[:jaro]:
  #                  :ignore_case treat ASCII a-z as A-Z
  #                  :adj_table   give partial credit to unmatched codepoints
  #                               that are paired in DEFAULT_ADJ_TABLE
  #
  # Returns a Float; 0.0 when either input is empty or nothing matches.
  def _jaro_distance codes1, codes2, options={}
    options = DEFAULT_OPTIONS[:jaro].merge options

    # Keep codes1 as the shorter sequence.
    codes1, codes2 = codes2, codes1 if codes1.length > codes2.length
    len1, len2 = codes1.length, codes2.length
    return 0.0 if len1 == 0 || len2 == 0

    if options[:ignore_case]
      codes1 = _upcase_ascii(codes1)
      codes2 = _upcase_ascii(codes2)
    end

    window = [len2 / 2 - 1, 0].max
    matched1 = Array.new(len1, false)
    matched2 = Array.new(len2, false)

    # Count matching codepoints: each codepoint of codes1 claims the first
    # unclaimed equal codepoint inside its window of codes2.
    match_count = 0
    len1.times do |i|
      left = [i - window, 0].max
      right = [i + window, len2 - 1].min
      (left..right).each do |j|
        next if matched2[j] || codes1[i] != codes2[j]
        matched1[i] = matched2[j] = true
        match_count += 1
        break
      end
    end

    return 0.0 if match_count == 0

    # Count transpositions: walk both matched sequences in order and count
    # the positions where they disagree.
    transposition_count = 0
    k = 0
    len1.times do |i|
      next unless matched1[i]
      k += 1 while k < len2 && !matched2[k]
      transposition_count += 1 if codes1[i] != codes2[k]
      k += 1
    end

    # Count similarities among the codepoints that did not match.
    similar_count = 0
    if options[:adj_table] && len1 > match_count
      len1.times do |i|
        next if matched1[i]
        len2.times do |j|
          next if matched2[j]
          if DEFAULT_ADJ_TABLE[codes1[i].chr(Encoding::UTF_8)][codes2[j].chr(Encoding::UTF_8)]
            similar_count += 3
            break
          end
        end
      end
    end

    m = match_count.to_f
    t = transposition_count / 2 # integer division: whole transposition pairs only
    m = similar_count / 10.0 + m if options[:adj_table]
    (m / len1 + m / len2 + (m - t) / m) / 3
  end

  # Returns a new array in which the codepoints of ASCII a-z (97..122) are
  # replaced by those of A-Z; every other codepoint is kept as is.
  def _upcase_ascii codes
    codes.map { |c| c >= 97 && c <= 122 ? c - 32 : c }
  end

end
2 changes: 1 addition & 1 deletion lib/jaro_winkler/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module JaroWinkler
# Version string of the library.
# NOTE(review): VERSION is assigned twice below ("1.3.7", then '1.4.0'); this
# looks like the old and new value shown together — confirm only '1.4.0' is intended.
VERSION = "1.3.7"
VERSION = '1.4.0'
end
10 changes: 8 additions & 2 deletions test/test_jaro_winkler.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# encoding: utf-8
require 'minitest/autorun'
require 'jaro_winkler'

# Split into two processes and load a different implementation in each.
# Kernel#fork returns the child's pid (truthy) in the parent and nil in the child.
# NOTE(review): presumably this is so the tests below run once per implementation — confirm.
if fork
# Parent process: load jaro_winkler_ext, then wait for the child to exit.
require 'jaro_winkler/jaro_winkler_ext'
Process.wait
else
# Child process: load the pure-Ruby implementation.
require 'jaro_winkler/jaro_winkler_pure'
end

class TestJaroWinkler < Minitest::Test
def test_distance
Expand Down Expand Up @@ -102,4 +107,5 @@ def assert_distance score, str1, str2, options={}
def assert_jaro_distance score, str1, str2, options={}
assert_equal score, JaroWinkler.jaro_distance(str1, str2, options).round(4)
end

end

0 comments on commit 8c63b8a

Please sign in to comment.