Skip to content

Commit

Permalink
style: normalize coding style with clang-format
Browse files Browse the repository at this point in the history
clang-format **/*.{c,h}
  • Loading branch information
tonytonyjan committed Sep 30, 2017
1 parent 71f9e95 commit 20865f4
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 120 deletions.
81 changes: 46 additions & 35 deletions ext/jaro_winkler/adj_matrix.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,73 @@
#include "ruby.h"

// Default adjacency table. Entries are one-character strings and are
// consumed two at a time: entries 2k and 2k+1 form one pair of characters
// that the adjacency matrix records as similar.
const char *DEFAULT_ADJ_TABLE[] = {
    "A", "E", "A", "I", "A", "O", "A", "U", "B", "V", "E", "I",
    "E", "O", "E", "U", "I", "O", "I", "U", "O", "U", "I", "Y",
    "E", "Y", "C", "G", "E", "F", "W", "U", "W", "V", "X", "K",
    "S", "Z", "X", "S", "Q", "C", "U", "V", "M", "N", "L", "I",
    "Q", "O", "P", "R", "I", "J", "2", "Z", "5", "S", "8", "B",
    "1", "I", "1", "L", "0", "O", "0", "Q", "C", "K", "G", "J",
    "E", " ", "Y", " ", "S", " ",
};

void node_free(Node *head);

AdjMatrix* adj_matrix_new(uint32_t length){
AdjMatrix *adj_matrix_new(uint32_t length) {
AdjMatrix *matrix = malloc(sizeof(AdjMatrix));
matrix->length = length == 0 ? ADJ_MATRIX_DEFAULT_LENGTH : length;
matrix->table = malloc(matrix->length * sizeof(Node**));
for(size_t i = 0; i < matrix->length; i++){
matrix->table[i] = malloc(matrix->length * sizeof(Node*));
matrix->table = malloc(matrix->length * sizeof(Node **));
for (size_t i = 0; i < matrix->length; i++) {
matrix->table[i] = malloc(matrix->length * sizeof(Node *));
for (size_t j = 0; j < matrix->length; j++)
matrix->table[i][j] = NULL;
}
return matrix;
}

void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y){
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
Node *new_node = malloc(sizeof(Node)); new_node->x = h1; new_node->y = h2; new_node->next = NULL;
if(matrix->table[h1][h2] == NULL){
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y) {
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH;
Node *new_node = malloc(sizeof(Node));
new_node->x = h1;
new_node->y = h2;
new_node->next = NULL;
if (matrix->table[h1][h2] == NULL) {
matrix->table[h1][h2] = matrix->table[h2][h1] = new_node;
}
else{
} else {
Node *previous = NULL;
for(Node *i = matrix->table[h1][h2]; i != NULL; i = i->next) previous = i;
for (Node *i = matrix->table[h1][h2]; i != NULL; i = i->next)
previous = i;
previous->next = new_node;
}
}

char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y){
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) % ADJ_MATRIX_DEFAULT_LENGTH;
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y) {
uint32_t h1 = st_hash(&x, sizeof(x), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH,
h2 = st_hash(&y, sizeof(y), ADJ_MATRIX_SEED) %
ADJ_MATRIX_DEFAULT_LENGTH;
Node *node = matrix->table[h1][h2];
if(node == NULL) return 0;
else{
for(Node *i = node; i != NULL; i = i->next)
if((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1)) return 1;
if (node == NULL)
return 0;
else {
for (Node *i = node; i != NULL; i = i->next)
if ((i->x == h1 && i->y == h2) || (i->x == h2 && i->y == h1))
return 1;
return 0;
}
}

// Free every node of the chain that starts at head. A NULL head is a no-op.
// Walks the chain with a loop so stack use does not grow with chain length.
void node_free(Node *head) {
  while (head != NULL) {
    Node *next = head->next; // read the link before the node is released
    free(head);
    head = next;
  }
}

void adj_matrix_free(AdjMatrix *matrix){
for(size_t i = 0; i < matrix->length; i++){
for(size_t j = 0; j < matrix->length; j++)
if(matrix->table[i][j] != NULL){
void adj_matrix_free(AdjMatrix *matrix) {
for (size_t i = 0; i < matrix->length; i++) {
for (size_t j = 0; j < matrix->length; j++)
if (matrix->table[i][j] != NULL) {
node_free(matrix->table[i][j]);
matrix->table[i][j] = matrix->table[j][i] = NULL;
}
Expand All @@ -68,17 +79,17 @@ void adj_matrix_free(AdjMatrix *matrix){
free(matrix);
}

AdjMatrix* adj_matrix_default(){
AdjMatrix *adj_matrix_default() {
static char first_time = 1;
static AdjMatrix *ret_matrix;
if(first_time){
if (first_time) {
ret_matrix = adj_matrix_new(ADJ_MATRIX_DEFAULT_LENGTH);
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char*);
for(size_t i = 0; i < length; i += 2){
size_t length = sizeof(DEFAULT_ADJ_TABLE) / sizeof(char *);
for (size_t i = 0; i < length; i += 2) {
uint64_t code_1, code_2;
size_t dummy_length;
code_1 = *DEFAULT_ADJ_TABLE[i] & 0xff;
code_2 = *DEFAULT_ADJ_TABLE[i+1] & 0xff;
code_2 = *DEFAULT_ADJ_TABLE[i + 1] & 0xff;
adj_matrix_add(ret_matrix, code_1, code_2);
}
first_time = 0;
Expand Down
14 changes: 7 additions & 7 deletions ext/jaro_winkler/adj_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@
#define ADJ_MATRIX_DEFAULT_LENGTH 958
#define ADJ_MATRIX_SEED 9527

// One entry of a singly linked chain stored in an AdjMatrix cell.
typedef struct _node {
  struct _node *next; // following entry of the chain, NULL at the end
  uint64_t x, y;      // the two values adj_matrix_add() stored for this entry
} Node;

typedef struct{
typedef struct {
Node ***table;
uint32_t length;
} AdjMatrix;

// Create a length x length matrix; 0 selects ADJ_MATRIX_DEFAULT_LENGTH.
AdjMatrix *adj_matrix_new(uint32_t length);
// Record the pair (x, y) in the matrix.
void adj_matrix_add(AdjMatrix *matrix, uint64_t x, uint64_t y);
// Return 1 when the pair (x, y) is found in the matrix, 0 otherwise.
char adj_matrix_find(AdjMatrix *matrix, uint64_t x, uint64_t y);
// Release the matrix together with the chains it holds.
void adj_matrix_free(AdjMatrix *matrix);
// Matrix populated from DEFAULT_ADJ_TABLE on the first call.
AdjMatrix *adj_matrix_default(void);
22 changes: 10 additions & 12 deletions ext/jaro_winkler/codepoints.c
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#include "codepoints.h"
#include "ruby.h"
#include "ruby/encoding.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "ruby.h"
#include "ruby/encoding.h"
#include "codepoints.h"

void codepoints_init(CodePoints *codepoints, VALUE str){
void codepoints_init(CodePoints *codepoints, VALUE str) {
int32_t n;
uint32_t c;
const char *ptr, *end;
Expand All @@ -20,18 +20,16 @@ void codepoints_init(CodePoints *codepoints, VALUE str){
enc = rb_enc_get(str);

while (ptr < end) {
c = rb_enc_codepoint_len(ptr, end, &n, enc);
if(codepoints->length == codepoints->size) {
c = rb_enc_codepoint_len(ptr, end, &n, enc);
if (codepoints->length == codepoints->size) {
codepoints->size *= 2;
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) * codepoints->size);
codepoints->data = realloc(codepoints->data,
sizeof(*codepoints->data) * codepoints->size);
}
codepoints->data[codepoints->length++] = c;
ptr += n;
ptr += n;
}
RB_GC_GUARD(str);
}


// Release the buffer owned by codepoints. The CodePoints object itself is
// not freed. The fields are reset so a repeated call is harmless.
void codepoints_free(CodePoints *codepoints) {
  free(codepoints->data);
  codepoints->data = NULL;
  codepoints->length = 0;
  codepoints->size = 0;
}
8 changes: 4 additions & 4 deletions ext/jaro_winkler/codepoints.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
#include "ruby.h"
#include <stddef.h>
#include <stdint.h>

// Growable array of code points filled by codepoints_init().
typedef struct {
uint32_t *data; // heap buffer holding the code points; released by codepoints_free()
size_t length; // number of entries of data currently in use
size_t size; // number of entries data has room for; doubled when length reaches it
} CodePoints;

// Fill codepoints with the code points decoded from the Ruby string str.
void codepoints_init(CodePoints *codepoints, VALUE str);
// Release the buffer held by codepoints.
void codepoints_free(CodePoints *codepoints);
104 changes: 65 additions & 39 deletions ext/jaro_winkler/jaro.c
Original file line number Diff line number Diff line change
@@ -1,32 +1,46 @@
#include "jaro.h"
#include "codepoints.h"
#include "adj_matrix.h"
#include "codepoints.h"

#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#define DEFAULT_WEIGHT 0.1
#define DEFAULT_THRESHOLD 0.7
// Exchange the values of two lvalues of the same type.
// Each argument is parenthesized, and the temporary has its own name so it
// cannot be confused with the macro. Arguments are evaluated more than once,
// so they must be free of side effects.
#define SWAP(x, y)                                                             \
  do {                                                                         \
    __typeof__(x) swap_tmp_ = (x);                                             \
    (x) = (y);                                                                 \
    (y) = swap_tmp_;                                                           \
  } while (0)

// Baseline option values: DEFAULT_WEIGHT and DEFAULT_THRESHOLD, with
// ignore_case and adj_table both switched off.
const Options DEFAULT_OPTIONS = {
    .weight = DEFAULT_WEIGHT,
    .threshold = DEFAULT_THRESHOLD,
    .ignore_case = 0,
    .adj_table = 0,
};

double jaro_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* codepoints2, size_t len2, Options *opt){
if(!len1 || !len2) return 0.0;
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Options *opt) {
if (!len1 || !len2)
return 0.0;

if(len1 > len2){
if (len1 > len2) {
SWAP(codepoints1, codepoints2);
SWAP(len1, len2);
}

if(opt->ignore_case){
for(size_t i = 0; i < len1; i++) codepoints1[i] = tolower(codepoints1[i]);
for(size_t i = 0; i < len2; i++) codepoints2[i] = tolower(codepoints2[i]);
if (opt->ignore_case) {
for (size_t i = 0; i < len1; i++)
codepoints1[i] = tolower(codepoints1[i]);
for (size_t i = 0; i < len2; i++)
codepoints2[i] = tolower(codepoints2[i]);
}

int32_t window_size = len2/2 - 1;
if(window_size < 0) window_size = 0;
int32_t window_size = len2 / 2 - 1;
if (window_size < 0)
window_size = 0;

char short_codes_flag[len1];
char long_codes_flag[len2];
Expand All @@ -35,61 +49,73 @@ double jaro_distance_from_codes(uint32_t* codepoints1, size_t len1, uint32_t* co

// count number of matching characters
size_t match_count = 0;
for(size_t i = 0; i < len1; i++){
for (size_t i = 0; i < len1; i++) {
size_t left = (i >= window_size) ? i - window_size : 0;
size_t right = (i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
if(right > len2 - 1) right = len2 - 1;
for(size_t j = left; j <= right; j++){
if(!long_codes_flag[j] && codepoints1[i] == codepoints2[j]){
size_t right =
(i + window_size <= len2 - 1) ? (i + window_size) : (len2 - 1);
if (right > len2 - 1)
right = len2 - 1;
for (size_t j = left; j <= right; j++) {
if (!long_codes_flag[j] && codepoints1[i] == codepoints2[j]) {
short_codes_flag[i] = long_codes_flag[j] = 1;
match_count++;
break;
}
}
}

if(!match_count) return 0.0;
if (!match_count)
return 0.0;

// count number of transpositions
size_t transposition_count = 0, j = 0, k = 0;
for(size_t i = 0; i < len1; i++){
if(short_codes_flag[i]){
for(j = k; j < len2; j++){
if(long_codes_flag[j]){
for (size_t i = 0; i < len1; i++) {
if (short_codes_flag[i]) {
for (j = k; j < len2; j++) {
if (long_codes_flag[j]) {
k = j + 1;
break;
}
}
if(codepoints1[i] != codepoints2[j]) transposition_count++;
if (codepoints1[i] != codepoints2[j])
transposition_count++;
}
}

// count similarities in nonmatched characters
size_t similar_count = 0;
if(opt->adj_table && len1 > match_count)
for(size_t i = 0; i < len1; i++)
if(!short_codes_flag[i])
for(size_t j = 0; j < len2; j++)
if(!long_codes_flag[j])
if(adj_matrix_find(adj_matrix_default(), codepoints1[i], codepoints2[j])){
if (opt->adj_table && len1 > match_count)
for (size_t i = 0; i < len1; i++)
if (!short_codes_flag[i])
for (size_t j = 0; j < len2; j++)
if (!long_codes_flag[j])
if (adj_matrix_find(adj_matrix_default(), codepoints1[i],
codepoints2[j])) {
similar_count += 3;
break;
}

double m = (double)match_count;
double t = (double)(transposition_count/2);
if(opt->adj_table) m = similar_count/10.0 + m;
return (m/len1 + m/len2 + (m-t)/m) / 3;
double t = (double)(transposition_count / 2);
if (opt->adj_table)
m = similar_count / 10.0 + m;
return (m / len1 + m / len2 + (m - t) / m) / 3;
}

// Jaro-Winkler similarity of two code point sequences.
//
// Computes the Jaro value with jaro_distance_from_codes(). When that value
// is at least opt->threshold, it is raised by
// prefix * opt->weight * (1 - jaro), where prefix is the length of the common
// prefix, capped at 4. Below the threshold the Jaro value is returned as is.
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *opt) {
  double jaro_distance =
      jaro_distance_from_codes(codepoints1, len1, codepoints2, len2, opt);

  if (jaro_distance < opt->threshold)
    return jaro_distance;

  // jaro_distance_from_codes() swaps only its own copies of the arguments,
  // so either sequence may be the shorter one here. Bound the scan by the
  // shorter length so neither buffer is read past its end.
  size_t shorter = len1 < len2 ? len1 : len2;
  size_t limit = shorter > 4 ? 4 : shorter;

  size_t prefix = 0;
  while (prefix < limit && codepoints1[prefix] == codepoints2[prefix])
    prefix++;

  return jaro_distance + prefix * opt->weight * (1 - jaro_distance);
}
7 changes: 5 additions & 2 deletions ext/jaro_winkler/jaro.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@ typedef struct {

extern const Options DEFAULT_OPTIONS;

// Jaro similarity of two code point sequences.
double jaro_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                uint32_t *codepoints2, size_t len2,
                                Options *opt);
// Jaro-Winkler similarity of two code point sequences.
double jaro_winkler_distance_from_codes(uint32_t *codepoints1, size_t len1,
                                        uint32_t *codepoints2, size_t len2,
                                        Options *opt);
Loading

0 comments on commit 20865f4

Please sign in to comment.