Skip to content

Commit

Permalink
add JaroWinkler.jaro_distance
Browse files Browse the repository at this point in the history
  • Loading branch information
tonytonyjan committed Dec 12, 2015
1 parent 609a56d commit 7347807
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 40 deletions.
6 changes: 3 additions & 3 deletions ext/jaro_winkler/extconf.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
unless RUBY_PLATFORM == 'java'
require "mkmf"
require 'mkmf'
$CFLAGS << ' -std=c99 '
create_makefile("jaro_winkler/jaro_winkler")
create_makefile('jaro_winkler/jaro_winkler_ext')
else
dummy_makefile = open("Makefile", "wb")
dummy_makefile = open('Makefile', 'wb')
dummy_makefile.puts '.PHONY: install'
dummy_makefile.puts 'install:'
dummy_makefile.puts "\t" + '@echo "C extension is not installed, fall back to pure Ruby version instead."'
Expand Down
71 changes: 45 additions & 26 deletions ext/jaro_winkler/jaro.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,47 @@
#include <stdlib.h>
#include <ctype.h>

double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)

double jaro_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);
double jaro_winkler_distance_from_codes(unsigned long long *codes1, int len1, unsigned long long *codes2, int len2, LibJaroOption *opt);

// Jaro similarity of two byte strings.
//
// Returns 0.0 when either length is zero. Otherwise both strings are turned
// into code arrays with string_to_codes() (defined elsewhere) and scored by
// jaro_distance_from_codes(). `opt` is passed through untouched, so it must
// be a valid pointer. The code arrays are released with free() before
// returning.
double jaro_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  // Put the string with the smaller byte length first.
  if(long_str_len < short_str_len){
    char *str_tmp = short_str;
    short_str = long_str;
    long_str = str_tmp;

    int len_tmp = short_str_len;
    short_str_len = long_str_len;
    long_str_len = len_tmp;
  }

  unsigned long long *first_codes, *second_codes;
  int first_codes_len, second_codes_len;
  string_to_codes(short_str, short_str_len, &first_codes, &first_codes_len);
  string_to_codes(long_str, long_str_len, &second_codes, &second_codes_len);

  double score = jaro_distance_from_codes(first_codes, first_codes_len, second_codes, second_codes_len, opt);

  free(first_codes);
  free(second_codes);
  return score;
}

// Jaro-Winkler similarity of two byte strings.
//
// Returns 0.0 when either length is zero. Otherwise both strings are turned
// into code arrays with string_to_codes() (defined elsewhere) and scored by
// jaro_winkler_distance_from_codes(). `opt` is passed through untouched, so
// it must be a valid pointer. The code arrays are released with free()
// before returning.
double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str, int long_str_len, LibJaroOption *opt){
  if(short_str_len == 0 || long_str_len == 0) return 0.0;

  unsigned long long *first_codes, *second_codes;
  int first_codes_len, second_codes_len;
  string_to_codes(short_str, short_str_len, &first_codes, &first_codes_len);
  string_to_codes(long_str, long_str_len, &second_codes, &second_codes_len);

  double score = jaro_winkler_distance_from_codes(first_codes, first_codes_len, second_codes, second_codes_len, opt);

  free(first_codes);
  free(second_codes);
  return score;
}

double jaro_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
if(!short_codes_len || !long_codes_len) return 0.0;

if(short_codes_len > long_codes_len){
SWAP(short_codes, long_codes);
SWAP(short_codes_len, long_codes_len);
}

if(opt->ignore_case){
for(int i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
for(int i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
Expand All @@ -27,10 +55,10 @@ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str,
int window_size = long_codes_len/2 - 1;
if(window_size < 0) window_size = 0;

char short_codes_flag[short_str_len];
char long_codes_flag[long_str_len];
memset(short_codes_flag, 0, short_str_len);
memset(long_codes_flag, 0, long_str_len);
char short_codes_flag[short_codes_len];
char long_codes_flag[long_codes_len];
memset(short_codes_flag, 0, short_codes_len);
memset(long_codes_flag, 0, long_codes_len);

// count number of matching characters
int match_count = 0;
Expand All @@ -46,10 +74,8 @@ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str,
}
}
}
if(!match_count){
free(short_codes); free(long_codes);
return 0.0;
}

if(!match_count) return 0.0;

// count number of transpositions
int transposition_count = 0, j = 0, k = 0;
Expand Down Expand Up @@ -77,27 +103,20 @@ double jaro_winkler_distance(char* short_str, int short_str_len, char* long_str,
break;
}

// jaro distance
double jaro_distance;
double m = (double)match_count;
double t = (double)(transposition_count/2);
if(opt->adj_table) m = similar_count/10.0 + m;
jaro_distance = (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
}

// jaro winkler distance
if(!opt){
static LibJaroOption default_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD};
opt = &default_opt;
}
if(jaro_distance < opt->threshold){
free(short_codes); free(long_codes);
return jaro_distance;
}
// Jaro-Winkler similarity of two code arrays.
//
// Computes the plain Jaro score with jaro_distance_from_codes(). If that
// score is below opt->threshold it is returned unchanged; otherwise it is
// boosted by the length of the common prefix (at most 4 codes) scaled by
// opt->weight.
//
// short_codes/long_codes: code arrays of short_codes_len/long_codes_len
//   elements. They stay owned by the caller: jaro_winkler_distance() frees
//   them after this function returns, so they must not be freed here.
// opt: must be non-NULL; threshold and weight are read from it.
double jaro_winkler_distance_from_codes(unsigned long long* short_codes, int short_codes_len, unsigned long long* long_codes, int long_codes_len, LibJaroOption *opt){
  double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);

  if(jaro_distance < opt->threshold) return jaro_distance;

  // The arguments are not guaranteed to arrive ordered by length (the swap
  // inside jaro_distance_from_codes only affects its own copies), so bound
  // the prefix scan by BOTH lengths to avoid reading past the shorter array.
  int prefix_limit = short_codes_len < long_codes_len ? short_codes_len : long_codes_len;
  if(prefix_limit > 4) prefix_limit = 4;

  int prefix = 0;
  while(prefix < prefix_limit && short_codes[prefix] == long_codes[prefix]) prefix++;

  return jaro_distance + prefix*opt->weight*(1-jaro_distance);
}
4 changes: 3 additions & 1 deletion ext/jaro_winkler/jaro.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#ifndef LIBJARO_JARO_H
#define LIBJARO_JARO_H

#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)
#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7

Expand All @@ -10,6 +9,9 @@ typedef struct LibJaroOption{
char ignore_case, adj_table;
} LibJaroOption;


static const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
double jaro_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);
double jaro_winkler_distance(char *str1, int len1, char *str2, int len2, LibJaroOption *opt);

#endif
24 changes: 18 additions & 6 deletions ext/jaro_winkler/jaro_winkler.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,23 @@ VALUE rb_mJaroWinkler,
rb_eError,
rb_eInvalidWeightError;

VALUE distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt));

void Init_jaro_winkler(void){
void Init_jaro_winkler_ext(void){
rb_mJaroWinkler = rb_define_module("JaroWinkler");
rb_eError = rb_define_class_under(rb_mJaroWinkler, "Error", rb_eRuntimeError);
rb_eInvalidWeightError = rb_define_class_under(rb_mJaroWinkler, "InvalidWeightError", rb_eError);
rb_define_module_function(rb_mJaroWinkler, "c_distance", distance, -1);
rb_define_module_function(rb_mJaroWinkler, "distance", rb_jaro_winkler_distance, -1);
rb_define_module_function(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1);
}

VALUE distance(int argc, VALUE *argv, VALUE self){

VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(char *str1, int len1, char *str2, int len2, LibJaroOption *opt)){
VALUE s1, s2, opt;
rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
LibJaroOption c_opt = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
LibJaroOption c_opt = DEFAULT_OPT;
if(TYPE(opt) == T_HASH){
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
Expand All @@ -29,5 +33,13 @@ VALUE distance(int argc, VALUE *argv, VALUE self){
if(!NIL_P(ignore_case)) c_opt.ignore_case = (TYPE(ignore_case) == T_FALSE || NIL_P(ignore_case)) ? 0 : 1;
if(!NIL_P(adj_table)) c_opt.adj_table = (TYPE(adj_table) == T_FALSE || NIL_P(adj_table)) ? 0 : 1;
}
return rb_float_new(jaro_winkler_distance(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
return rb_float_new((*distance_fn)(StringValuePtr(s1), RSTRING_LEN(s1), StringValuePtr(s2), RSTRING_LEN(s2), &c_opt));
}

// Ruby entry point registered as JaroWinkler.jaro_distance: hands the raw
// arguments to the shared distance() helper with jaro_distance as the metric.
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self){
  double (*metric)(char *, int, char *, int, LibJaroOption *) = jaro_distance;
  return distance(argc, argv, self, metric);
}

// Ruby entry point registered as JaroWinkler.distance: hands the raw
// arguments to the shared distance() helper with jaro_winkler_distance as
// the metric.
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self){
  double (*metric)(char *, int, char *, int, LibJaroOption *) = jaro_winkler_distance;
  return distance(argc, argv, self, metric);
}
38 changes: 34 additions & 4 deletions test/test_jaro_winkler.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# encoding: utf-8
require 'minitest/autorun'
require 'jaro_winkler'

Expand All @@ -8,7 +9,7 @@ def test_distance
assert_distance 0.9611, 'martha', 'marhta'
assert_distance 0.8324, 'jones', 'johnson'
assert_distance 0.9583, 'abcvwxyz', 'cabvwxyz'
assert_distance 0.8400, 'dwayne', 'duane'
assert_distance 0.84, 'dwayne', 'duane'
assert_distance 0.8133, 'dixon', 'dicksonx'
assert_distance 0.0, 'fvie', 'ten'
assert_distance 1.0, 'tony', 'tony'
Expand All @@ -23,7 +24,32 @@ def test_distance
assert_distance 0.9067, 'does_exist', 'doesnt_exist'
assert_distance 0.975, '12345678', '12345687'
assert_distance 0.975, '12345678', '12345867'
assert_distance 0.95, '12345678', '12348567'
assert_distance 0.95, '12345678', '12348567'
end

# Checks JaroWinkler.jaro_distance (rounded to 4 decimals by
# assert_jaro_distance) against a table of expected scores.
def test_jaro_distance
  # Each row: expected score, first string, second string.
  cases = [
    [0.9444, 'henka',       'henkan'],
    [1.0,    'al',          'al'],
    [0.9444, 'martha',      'marhta'],
    [0.7905, 'jones',       'johnson'],
    [0.9583, 'abcvwxyz',    'cabvwxyz'],
    [0.8222, 'dwayne',      'duane'],
    [0.7667, 'dixon',       'dicksonx'],
    [0.0,    'fvie',        'ten'],
    [1.0,    'tony',        'tony'],
    [1.0,    'tonytonyjan', 'tonytonyjan'],
    [1.0,    'x',           'x'],
    [0.0,    '',            ''],
    [0.0,    'tony',        ''],
    [0.0,    '',            'tony'],
    [0.7879, 'tonytonyjan', 'tony'],
    [0.7879, 'tony',        'tonytonyjan'],
    [0.9259, 'necessary',   'nessecary'],
    [0.8444, 'does_exist',  'doesnt_exist'],
    [0.9583, '12345678',    '12345687'],
    [0.9583, '12345678',    '12345867'],
    [0.9167, '12345678',    '12348567'],
    [0.604,  'tonytonyjan', 'janjantony']
  ]

  cases.each do |expected, left, right|
    assert_jaro_distance expected, left, right
  end
end

def test_unicode
Expand Down Expand Up @@ -69,7 +95,11 @@ def test_long_string

private

def assert_distance score, str1, str2, **options
assert_equal score, JaroWinkler.distance(str1, str2, **options).round(4)
# Asserts that JaroWinkler.distance(str1, str2, options), rounded to four
# decimal places, equals +score+.
def assert_distance score, str1, str2, options={}
  actual = JaroWinkler.distance(str1, str2, options).round(4)
  assert_equal score, actual
end

# Asserts that JaroWinkler.jaro_distance(str1, str2, options), rounded to
# four decimal places, equals +score+.
def assert_jaro_distance score, str1, str2, options={}
  actual = JaroWinkler.jaro_distance(str1, str2, options).round(4)
  assert_equal score, actual
end
end

0 comments on commit 7347807

Please sign in to comment.