Skip to content

Commit

Permalink
refactor: rename functions, variables and arguments so that they make more sense
Browse files Browse the repository at this point in the history
  • Loading branch information
tonytonyjan committed Sep 30, 2017
1 parent 106da9c commit 71f9e95
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 37 deletions.
58 changes: 29 additions & 29 deletions ext/jaro_winkler/jaro.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,37 @@
// Default value for the options' `threshold` field. jaro_winkler_distance_from_codes
// returns the plain Jaro score unchanged when it is below the configured threshold.
#define DEFAULT_THRESHOLD 0.7
// Exchange the values of two lvalues of the same type. Relies on the __typeof__
// compiler extension for the temporary; the do{...}while(0) wrapper makes the
// expansion behave as a single statement. The arguments are expanded more than
// once, so pass only plain, side-effect-free lvalues.
#define SWAP(x, y) do{ __typeof__(x) SWAP = x; x = y; y = SWAP; }while(0)

// Default option set: default weight and threshold, case-sensitive comparison
// (ignore_case = 0) and no adjacency-table bonus (adj_table = 0).
// This is a rendered diff of a rename: the first line below is the old spelling
// (LibJaroOption / DEFAULT_OPT), the second is the new one (Options / DEFAULT_OPTIONS).
const LibJaroOption DEFAULT_OPT = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};
const Options DEFAULT_OPTIONS = {.weight = DEFAULT_WEIGHT, .threshold = DEFAULT_THRESHOLD, .ignore_case = 0, .adj_table = 0};

double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
if(!short_codes_len || !long_codes_len) return 0.0;
double jaro_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options *opt){
if(!len1 || !len2) return 0.0;

if(short_codes_len > long_codes_len){
SWAP(short_codes, long_codes);
SWAP(short_codes_len, long_codes_len);
if(len1 > len2){
SWAP(codepoints1, codepoints2);
SWAP(len1, len2);
}

if(opt->ignore_case){
for(size_t i = 0; i < short_codes_len; i++) short_codes[i] = tolower(short_codes[i]);
for(size_t i = 0; i < long_codes_len; i++) long_codes[i] = tolower(long_codes[i]);
for(size_t i = 0; i < len1; i++) codepoints1[i] = tolower(codepoints1[i]);
for(size_t i = 0; i < len2; i++) codepoints2[i] = tolower(codepoints2[i]);
}

int32_t window_size = long_codes_len/2 - 1;
int32_t window_size = len2/2 - 1;
if(window_size < 0) window_size = 0;

char short_codes_flag[short_codes_len];
char long_codes_flag[long_codes_len];
memset(short_codes_flag, 0, short_codes_len);
memset(long_codes_flag, 0, long_codes_len);
char short_codes_flag[len1];
char long_codes_flag[len2];
memset(short_codes_flag, 0, len1);
memset(long_codes_flag, 0, len2);

// count number of matching characters
size_t match_count = 0;
for(size_t i = 0; i < short_codes_len; i++){
for(size_t i = 0; i < len1; i++){
size_t left = (i >= window_size) ? i - window_size : 0;
size_t right = (i + window_size <= long_codes_len - 1) ? (i + window_size) : (long_codes_len - 1);
if(right > long_codes_len - 1) right = long_codes_len - 1;
size_t right = (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
if(right > len2 - 1) right = len2 - 1;
for(size_t j = left; j <= right; j++){
if(!long_codes_flag[j] && short_codes[i] == long_codes[j]){
if(!long_codes_flag[j] && codepoints1[i] == codepoints2[j]){
short_codes_flag[i] = long_codes_flag[j] = 1;
match_count++;
break;
Expand All @@ -52,44 +52,44 @@ double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, u

// count number of transpositions
size_t transposition_count = 0, j = 0, k = 0;
for(size_t i = 0; i < short_codes_len; i++){
for(size_t i = 0; i < len1; i++){
if(short_codes_flag[i]){
for(j = k; j < long_codes_len; j++){
for(j = k; j < len2; j++){
if(long_codes_flag[j]){
k = j + 1;
break;
}
}
if(short_codes[i] != long_codes[j]) transposition_count++;
if(codepoints1[i] != codepoints2[j]) transposition_count++;
}
}

// count similarities in nonmatched characters
size_t similar_count = 0;
if(opt->adj_table && short_codes_len > match_count)
for(size_t i = 0; i < short_codes_len; i++)
if(opt->adj_table && len1 > match_count)
for(size_t i = 0; i < len1; i++)
if(!short_codes_flag[i])
for(size_t j = 0; j < long_codes_len; j++)
for(size_t j = 0; j < len2; j++)
if(!long_codes_flag[j])
if(adj_matrix_find(adj_matrix_default(), short_codes[i], long_codes[j])){
if(adj_matrix_find(adj_matrix_default(), codepoints1[i], codepoints2[j])){
similar_count += 3;
break;
}

double m = (double)match_count;
double t = (double)(transposition_count/2);
if(opt->adj_table) m = similar_count/10.0 + m;
return (m/short_codes_len + m/long_codes_len + (m-t)/m) / 3;
return (m/len1 + m/len2 + (m-t)/m) / 3;
}

// Jaro-Winkler score of two code point arrays.
//
// Computes the Jaro score with jaro_distance_from_codes() and, when that score
// reaches opt->threshold, boosts it according to the length of the common prefix
// (at most 4 code points):
//   result = jaro + prefix * opt->weight * (1 - jaro)
// Scores below the threshold are returned unchanged.
//
// Parameters: two code point arrays with their element counts, and the options
// (threshold, weight, plus whatever jaro_distance_from_codes() reads from it).
//
// This is a rendered diff of a rename: where a line appears twice, the first is the
// old spelling (short_codes/long_codes, LibJaroOption) and the second the new one
// (codepoints1/codepoints2, Options).
//
// NOTE(review): the prefix scan is bounded by the first length only (max_4), never
// by the second. If the first array is longer than the second and the second is a
// prefix of it shorter than 4, the comparison indexes the second array past its
// end. The old parameter names implied "shorter first"; the new names do not, so
// confirm what callers guarantee.
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt){
double jaro_distance = jaro_distance_from_codes(short_codes, short_codes_len, long_codes, long_codes_len, opt);
double jaro_winkler_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options *opt){
double jaro_distance = jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);

// Below the threshold the prefix bonus is not applied.
if(jaro_distance < opt->threshold) return jaro_distance;
else{
size_t prefix = 0;
// Cap the prefix scan at 4 code points, or at the first array's length if shorter.
size_t max_4 = short_codes_len > 4 ? 4 : short_codes_len;
// Empty-bodied loop: it only advances `prefix` to the length of the common prefix.
for(prefix = 0; prefix < max_4 && short_codes[prefix] == long_codes[prefix]; prefix++);
size_t max_4 = len1 > 4 ? 4 : len1;
for(prefix = 0; prefix < max_4 && codepoints1[prefix] == codepoints2[prefix]; prefix++);
// Move the score towards 1 in proportion to the prefix length and the weight.
return jaro_distance + prefix*opt->weight*(1-jaro_distance);
}
}
10 changes: 5 additions & 5 deletions ext/jaro_winkler/jaro.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
#include <stddef.h>
#include <stdint.h>

// Tuning options accepted by the distance functions.
// This is a rendered diff of a rename: the struct was `LibJaroOption` (first
// opening/closing line of each pair) and becomes the anonymous struct typedef
// `Options` (second line of each pair).
typedef struct LibJaroOption{
typedef struct {
// weight:    per-prefix-character multiplier in the Jaro-Winkler bonus.
// threshold: minimum Jaro score at which the prefix bonus is applied.
double weight, threshold;
// ignore_case: non-zero makes the Jaro computation lowercase both inputs in place.
// adj_table:   non-zero enables the adjacency-table bonus for unmatched characters.
char ignore_case, adj_table;
} LibJaroOption;
} Options;

// Default option set, defined in jaro.c (old name first, new name second).
extern const LibJaroOption DEFAULT_OPT;
extern const Options DEFAULT_OPTIONS;

// Public scoring functions over arrays of 32-bit code points. Each takes two
// arrays with their element counts and a pointer to the options.
// - jaro_distance_from_codes: Jaro score; returns 0.0 when either length is zero.
//   The arrays are not const: with ignore_case set they are lowercased in place.
// - jaro_winkler_distance_from_codes: the Jaro score plus a common-prefix bonus
//   once the score reaches the options' threshold.
// The first two prototypes are the pre-rename signatures, the last two the
// post-rename ones (which omit the name of the options parameter).
double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt);
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt);
double jaro_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options*);
double jaro_winkler_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options*);
6 changes: 3 additions & 3 deletions ext/jaro_winkler/jaro_winkler.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ VALUE rb_mJaroWinkler,

// Forward declarations for this extension file.
// NOTE(review): rb_jaro_winkler_distance and rb_jaro_distance presumably forward to
// distance() with the matching scoring function; their bodies are not visible in
// this hunk, so confirm.
VALUE rb_jaro_winkler_distance(size_t argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(size_t argc, VALUE *argv, VALUE self);
// distance() takes the scoring function as `distance_fn`; its signature matches
// jaro_distance_from_codes / jaro_winkler_distance_from_codes. The first prototype
// is the pre-rename spelling, the second the post-rename one.
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt));
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options*));

void Init_jaro_winkler_ext(void){
rb_mJaroWinkler = rb_define_module("JaroWinkler");
Expand All @@ -19,15 +19,15 @@ void Init_jaro_winkler_ext(void){
}


VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt)){
VALUE distance(size_t argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options*)){
VALUE s1, s2, opt;
CodePoints cp1, cp2;

rb_scan_args(argc, argv, "2:", &s1, &s2, &opt);
codepoints_init(&cp1, s1);
codepoints_init(&cp2, s2);

LibJaroOption c_opt = DEFAULT_OPT;
Options c_opt = DEFAULT_OPTIONS;
if(TYPE(opt) == T_HASH){
VALUE weight = rb_hash_aref(opt, ID2SYM(rb_intern("weight"))),
threshold = rb_hash_aref(opt, ID2SYM(rb_intern("threshold"))),
Expand Down

0 comments on commit 71f9e95

Please sign in to comment.