-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: support encodings other than utf-8
Do not treat every string as UTF-8 encoded; instead, use Ruby's API to get Unicode codepoints. Closes #7.
- Loading branch information
1 parent
1a37c7e
commit fe72ab4
Showing
7 changed files
with
71 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,37 @@ | ||
#include <stdint.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include "ruby.h" | ||
#include "ruby/encoding.h" | ||
#include "code.h" | ||
|
||
/*
 * Reads one multi-byte character from the start of str, choosing a sequence
 * length of 1 to 6 bytes from the value of the first byte alone.
 *
 * str:             input bytes; str[0] selects the length and that many bytes
 *                  are then copied out of str.
 * ret_code:        set to 0, then overwritten with the raw bytes of the
 *                  sequence via memcpy.
 * ret_byte_length: receives the chosen sequence length (1-6).
 *
 * The value stored in *ret_code is the undecoded byte sequence, not a Unicode
 * codepoint, so its numeric value depends on the host byte order.
 * First bytes below 192 (including 128-191) fall through to length 1.
 * NOTE(review): no buffer length is passed in, so the memcpy can read past the
 * end of an input that stops in the middle of a sequence.
 */
void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length){ | ||
unsigned char first_char = str[0]; | ||
// Thresholds are checked from the longest form down; the first match wins.
if(first_char >= 252) *ret_byte_length = 6; // 1111110x | ||
else if(first_char >= 248) *ret_byte_length = 5; // 111110xx | ||
else if(first_char >= 240) *ret_byte_length = 4; // 11110xxx | ||
else if(first_char >= 224) *ret_byte_length = 3; // 1110xxxx | ||
else if(first_char >= 192) *ret_byte_length = 2; // 110xxxxx | ||
else *ret_byte_length = 1; | ||
// Clear all 8 bytes first so the bytes memcpy does not write stay zero.
*ret_code = 0; | ||
memcpy(ret_code, str, *ret_byte_length); | ||
} | ||
|
||
// NOTE(review): this span is a rendered diff whose +/- markers were lost.
// Two function headers (string_to_codes, codepoints_init) and two loop
// headers (for, while) share a single set of closing braces, so lines from
// both functions are interleaved here. Lines that use ret_codes, ret_length,
// length or str[i] belong to string_to_codes; lines that use codepoints, ptr,
// end, enc, n or c belong to codepoints_init.

// string_to_codes: allocates `length` zeroed slots in *ret_codes, then walks
// str one character at a time with utf_char_to_code, counting characters in
// *ret_length.
void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length){ | ||
uint32_t code; | ||
char byte_length; | ||
// codepoints_init: fills *codepoints with one uint32_t per character of the
// Ruby string str, using the string's own encoding (rb_enc_get) to read each
// character. On return codepoints->data is a heap buffer holding
// codepoints->length entries; codepoints_free releases it.
void codepoints_init(CodePoints *codepoints, VALUE str){ | ||
int32_t n; | ||
uint32_t c; | ||
const char *ptr, *end; | ||
rb_encoding *enc; | ||
|
||
// (string_to_codes) NOTE(review): the calloc result is not checked.
*ret_codes = calloc(length, sizeof(long long)); | ||
*ret_length = 0; | ||
// (codepoints_init) Start empty with room for 32 entries.
// NOTE(review): the malloc result is not checked before it is written to.
codepoints->length = 0; | ||
codepoints->size = 32; | ||
codepoints->data = malloc(codepoints->size * sizeof(*codepoints->data)); | ||
// str is replaced by the result of rb_str_new_frozen; ptr and end are taken
// from that result, not from the value the caller passed in.
str = rb_str_new_frozen(str); | ||
ptr = RSTRING_PTR(str); | ||
end = RSTRING_END(str); | ||
enc = rb_enc_get(str); | ||
|
||
// (string_to_codes) Advance by the byte length reported for each character.
for(size_t i = 0; i < length;){ | ||
size_t byte_length; | ||
utf_char_to_code(&str[i], &(*ret_codes)[*ret_length], &byte_length); | ||
*ret_length += 1; | ||
i += byte_length; | ||
// (codepoints_init) c receives the value returned for the character at ptr,
// and ptr is then advanced by n, which the call fills in.
// NOTE(review): if rb_enc_codepoint_len raises (e.g. on an invalid byte
// sequence; confirm against the Ruby C API), codepoints->data is not freed.
while (ptr < end) { | ||
c = rb_enc_codepoint_len(ptr, end, &n, enc); | ||
// Buffer full: double the capacity before appending.
// NOTE(review): the realloc result overwrites data without a NULL check.
if(codepoints->length == codepoints->size) { | ||
codepoints->size *= 2; | ||
codepoints->data = realloc(codepoints->data, sizeof(*codepoints->data) * codepoints->size); | ||
} | ||
codepoints->data[codepoints->length++] = c; | ||
ptr += n; | ||
} | ||
// Presumably keeps the frozen copy reachable while ptr/end point into it;
// confirm against Ruby's extension documentation.
RB_GC_GUARD(str); | ||
} | ||
|
||
|
||
/*
 * Releases the buffer held in codepoints->data.
 * The CodePoints struct itself is not freed, and data/length/size are left
 * unchanged, so data must not be used again after this call.
 */
void codepoints_free(CodePoints *codepoints) { | ||
free(codepoints->data); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,13 @@ | ||
#pragma once | ||
#include <stdint.h> | ||
#include <stddef.h> | ||
#include "ruby.h" | ||
|
||
void utf_char_to_code(char *str, uint64_t *ret_code, size_t *ret_byte_length); | ||
void string_to_codes(char *str, size_t length, uint64_t **ret_codes, size_t *ret_length); | ||
/*
 * Growable array of 32-bit character codes.
 * Set up by codepoints_init and released by codepoints_free.
 */
typedef struct { | ||
// Heap buffer holding the stored values.
uint32_t *data; | ||
// Number of entries currently stored in data.
size_t length; | ||
// Number of entries data has room for.
size_t size; | ||
} CodePoints; | ||
|
||
void codepoints_init(CodePoints*, VALUE str); | ||
void codepoints_free(CodePoints*); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,14 @@ | ||
#pragma once | ||
|
||
#include <stddef.h> | ||
|
||
#define DEFAULT_WEIGHT 0.1 | ||
#define DEFAULT_THRESHOLD 0.7 | ||
#include <stdint.h> | ||
|
||
/*
 * Options passed by pointer to the jaro_* distance functions declared in this
 * header.
 */
typedef struct LibJaroOption{ | ||
// NOTE(review): presumably the Winkler prefix weight and the score threshold
// above which it is applied; confirm against the implementation.
double weight, threshold; | ||
// NOTE(review): presumably boolean flags stored as char (case-insensitive
// matching, use of an adjusting table); confirm against the implementation.
char ignore_case, adj_table; | ||
} LibJaroOption; | ||
|
||
|
||
extern const LibJaroOption DEFAULT_OPT; | ||
double jaro_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt); | ||
double jaro_winkler_distance(char *str1, size_t len1, char *str2, size_t len2, LibJaroOption *opt); | ||
|
||
double jaro_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt); | ||
double jaro_winkler_distance_from_codes(uint32_t* short_codes, size_t short_codes_len, uint32_t* long_codes, size_t long_codes_len, LibJaroOption *opt); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters