Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update graphemes for Unicode 7 #20

Merged
merged 2 commits into from
Dec 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ bench/bench
bench/icu
bench/unistring
normtest
graphemetest
utf8proc_data.c.new
printproperty
26 changes: 20 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

CURL=curl
RUBY=ruby
PERL=perl
MAKE=make

# settings
Expand All @@ -24,20 +25,23 @@ all: c-library
c-library: libmojibake.a libmojibake.$(SHLIB_EXT)

clean:
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
$(MAKE) -C bench clean

update: utf8proc_data.c.new
cp -f utf8proc_data.c.new utf8proc_data.c

# real targets

utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new

UnicodeData.txt:

$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

GraphemeBreakProperty.txt:
$(CURL) -O http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

DerivedCoreProperties.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt

Expand Down Expand Up @@ -67,8 +71,18 @@ libmojibake.dylib: utf8proc.o
NormalizationTest.txt:
$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt

normtest: normtest.c utf8proc.o mojibake.h
$(cc) normtest.c utf8proc.o -o normtest
# Download the grapheme-break conformance data and rewrite the non-ASCII
# markers into ASCII ('÷' -> '/', '×' -> '+').
# The download and the rewrite are separate steps, and the result is renamed
# into place last: with a plain `curl | perl > $@` pipe the recipe's status is
# perl's, so a failed download would still leave an empty, up-to-date-looking
# target behind.
GraphemeBreakTest.txt:
	$(CURL) -o $@.raw http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
	$(PERL) -pe 's,÷,/,g;s,×,+,g' $@.raw > $@.tmp
	mv -f $@.tmp $@
	rm -f $@.raw

normtest: normtest.c utf8proc.o mojibake.h tests.h
$(cc) normtest.c utf8proc.o -o $@

graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
$(cc) graphemetest.c utf8proc.o -o $@

printproperty: printproperty.c utf8proc.o mojibake.h tests.h
$(cc) printproperty.c utf8proc.o -o $@

check: normtest NormalizationTest.txt
check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
./normtest
./graphemetest
22 changes: 11 additions & 11 deletions data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@
end
end

$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
$grapheme_extend = []
$grapheme_extend_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
$1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
elsif entry =~ /^[0-9A-F]+/
$grapheme_extend << $&.hex
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
$1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_" + $3.upcase }
elsif entry =~ /^([0-9A-F]+)\s*;\s*([A-Za-z_]+)/
$grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_" + $2.upcase
end
end

Expand Down Expand Up @@ -161,18 +161,18 @@ def c_entry(comb1_indicies, comb2_indicies)
"#{str2c bidi_class, 'BIDI_CLASS'}, " <<
"#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
"#{ary2c decomp_mapping}, " <<
"#{bidi_mirrored}, " <<
"#{ary2c case_folding}, " <<
"#{uppercase_mapping or -1}, " <<
"#{lowercase_mapping or -1}, " <<
"#{titlecase_mapping or -1}, " <<
"#{comb1_indicies[code] ?
(comb1_indicies[code]*comb2_indicies.keys.length) : -1
}, #{comb2_indicies[code] or -1}, " <<
"#{bidi_mirrored}, " <<
"#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
"#{$ignorable.include?(code)}, " <<
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
"#{$grapheme_extend.include?(code)}, " <<
"#{ary2c case_folding}},\n"
"#{$grapheme_boundclass[code]}},\n"
end
end

Expand Down Expand Up @@ -295,7 +295,7 @@ def c_entry(comb1_indicies, comb2_indicies)
$stdout << "};\n\n"

$stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
$stdout << " {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}
Expand Down
73 changes: 73 additions & 0 deletions graphemetest.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include "tests.h"

/* Grapheme-cluster conformance driver.
 *
 * Reads GraphemeBreakTest.txt line by line.  Each data line is expected to be
 * a sequence of hex codepoints separated by '/' (break permitted) or '+' (no
 * break), optionally followed by a '#' comment.  The expected string (UTF-8
 * with '/' between clusters) is built in src, the same text without the
 * separators is passed through utf8proc_map with UTF8PROC_CHARBOUND, and the
 * two are compared after the 0xff bytes in the result are turned into '/'.
 *
 * Exits through check() on the first mismatch or malformed line; returns 0
 * when every line passes.
 */
int main(void)
{
    char *buf = NULL;
    size_t bufsize = 0;
    FILE *f = fopen("GraphemeBreakTest.txt", "r");
    uint8_t src[1024];

    check(f != NULL, "error opening GraphemeBreakTest.txt");
    while (getline(&buf, &bufsize, f) > 0) {
        size_t bi = 0, si = 0;
        lineno += 1;

        if (lineno % 100 == 0)
            printf("checking line %zu...\n", lineno);

        if (buf[0] == '#') continue;

        /* Guard the fixed-size src/utf8 buffers.  Only the part before the
           comment is ever encoded; this assumes utf8proc_encode_char never
           emits more bytes than the hex digits it was given -- TODO confirm. */
        check(strcspn(buf, "#") < sizeof(src),
              "test data too long for %zu-byte buffer", sizeof(src));

        while (buf[bi]) {
            bi = skipspaces(buf, bi);
            if (buf[bi] == '/') { /* grapheme break */
                src[si++] = '/';
                bi++;
            }
            else if (buf[bi] == '+') { /* no break */
                bi++;
            }
            else if (buf[bi] == '#' || buf[bi] == '\0') {
                break; /* start of comment, or nothing left on the line */
            }
            else { /* hex-encoded codepoint */
                /* encode() returns one more than the number of chars consumed */
                size_t used = encode((char*) (src + si), buf + bi) - 1;
                /* without this an unrecognised character would never be
                   consumed and the loop would spin forever */
                check(used > 0, "unexpected character '%c' in test data", buf[bi]);
                bi += used;
                while (src[si]) ++si; /* advance to NUL termination */
            }
        }
        if (si && src[si-1] == '/')
            --si; /* no break after final grapheme */
        src[si] = 0; /* NUL-terminate */

        if (si) {
            uint8_t utf8[1024]; /* copy of src without the '/' separators */
            size_t i, j = 0;
            ssize_t glen;
            uint8_t *g = NULL; /* utf8proc_map result; NULL so free() is safe on any path */

            for (i = 0; i < si; ++i)
                if (src[i] != '/')
                    utf8[j++] = src[i];

            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
                /* the test file contains surrogate codepoints, which are only for UTF-16 */
                printf("line %zu: ignoring invalid UTF-8 codepoints\n", lineno);
            }
            else {
                check(glen >= 0, "utf8proc_map error = %s",
                      utf8proc_errmsg(glen));
                for (i = 0; i < (size_t) glen; ++i)
                    if (g[i] == 0xff)
                        g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
                printf("line %zu\n", lineno);
                check(!strcmp((char*)g, (char*)src),
                      "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
            }
            free(g);
        }
    }
    free(buf); /* buffer allocated by getline */
    fclose(f);
    printf("Passed tests after %zu lines!\n", lineno);
    return 0;
}
27 changes: 24 additions & 3 deletions mojibake.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,17 +170,17 @@ typedef struct utf8proc_property_struct {
utf8proc_propval_t bidi_class;
utf8proc_propval_t decomp_type;
const int32_t *decomp_mapping;
unsigned bidi_mirrored:1;
const int32_t *casefold_mapping;
int32_t uppercase_mapping;
int32_t lowercase_mapping;
int32_t titlecase_mapping;
int32_t comb1st_index;
int32_t comb2nd_index;
unsigned bidi_mirrored:1;
unsigned comp_exclusion:1;
unsigned ignorable:1;
unsigned control_boundary:1;
unsigned extend:1;
const int32_t *casefold_mapping;
unsigned boundclass:4;
} utf8proc_property_t;

#define UTF8PROC_CATEGORY_LU 1
Expand Down Expand Up @@ -253,6 +253,21 @@ typedef struct utf8proc_property_struct {
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
#define UTF8PROC_DECOMP_TYPE_COMPAT 16

/* values for boundclass property: */
#define UTF8PROC_BOUNDCLASS_START 0
#define UTF8PROC_BOUNDCLASS_OTHER 1
#define UTF8PROC_BOUNDCLASS_CR 2
#define UTF8PROC_BOUNDCLASS_LF 3
#define UTF8PROC_BOUNDCLASS_CONTROL 4
#define UTF8PROC_BOUNDCLASS_EXTEND 5
#define UTF8PROC_BOUNDCLASS_L 6
#define UTF8PROC_BOUNDCLASS_V 7
#define UTF8PROC_BOUNDCLASS_T 8
#define UTF8PROC_BOUNDCLASS_LV 9
#define UTF8PROC_BOUNDCLASS_LVT 10
#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12

DLLEXPORT extern const int8_t utf8proc_utf8class[256];

DLLEXPORT const char *utf8proc_version(void);
Expand Down Expand Up @@ -367,6 +382,12 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options
* crash!
*/

DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
/*
* Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
*/

DLLEXPORT ssize_t utf8proc_map(
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
);
Expand Down
45 changes: 1 addition & 44 deletions normtest.c
Original file line number Diff line number Diff line change
@@ -1,47 +1,4 @@
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>

#include "mojibake.h"

/* Number of the input line currently being processed; check() prefixes its
   diagnostics with it. */
size_t lineno = 0;

/* Assert-style helper: when cond is zero, prints "line <lineno>: " followed by
   the printf-style message and a newline to stderr, then exits with status 1.
   Does nothing when cond is non-zero. */
void check(int cond, const char *format, ...)
{
    if (!cond) {
        va_list args;
        fprintf(stderr, "line %zu: ", lineno); /* %zu: lineno is an unsigned size_t */
        va_start(args, format);
        vfprintf(stderr, format, args);
        va_end(args);
        fprintf(stderr, "\n");
        exit(1);
    }
}

/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
   separated by whitespace, and terminated by any character not in
   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
   (NUL-terminated) in dest.  Returns one more than the index of that
   terminating character, i.e. the number of bytes read from buf including
   the terminator itself. */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;
    do {
        unsigned int c; /* %x stores through an unsigned int pointer */
        /* the <ctype.h> functions are only defined for unsigned char values
           (and EOF), so cast before classifying possibly-negative chars */
        while (isspace((unsigned char) buf[i])) ++i; /* skip whitespace */
        for (j=i; isxdigit((unsigned char) buf[j]); ++j)
            ; /* find end of hex input */
        if (j == i) { /* no codepoint found */
            dest[d] = 0; /* NUL-terminate destination string */
            return i + 1;
        }
        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j; /* skip to char after hex input */
        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
    } while (1);
}
#include "tests.h"

#define CHECK_NORM(NRM, norm, src) { \
char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
Expand Down
45 changes: 45 additions & 0 deletions printproperty.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* simple test program to print out the utf8proc properties for a codepoint */

#include "tests.h"

/* For every command-line argument, parses it as a hexadecimal codepoint,
   looks the codepoint up with utf8proc_get_property, and prints the numeric
   value of each property field.  An argument that does not start with a hex
   number aborts the program through check(). */
int main(int argc, char **argv)
{
    int i;

    for (i = 1; i < argc; ++i) {
        unsigned int c; /* %x stores through an unsigned int pointer */
        const utf8proc_property_t *p;

        check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
        p = utf8proc_get_property((int32_t) c);
        /* the *_mapping fields are signed int32_t but printed with %x, which
           takes an unsigned int, hence the explicit casts */
        printf("U+%s:\n"
               "  category = %d\n"
               "  combining_class = %d\n"
               "  bidi_class = %d\n"
               "  decomp_type = %d\n"
               "  uppercase_mapping = %x\n"
               "  lowercase_mapping = %x\n"
               "  titlecase_mapping = %x\n"
               "  comb1st_index = %d\n"
               "  comb2nd_index = %d\n"
               "  bidi_mirrored = %d\n"
               "  comp_exclusion = %d\n"
               "  ignorable = %d\n"
               "  control_boundary = %d\n"
               "  boundclass = %d\n",
               argv[i],
               p->category,
               p->combining_class,
               p->bidi_class,
               p->decomp_type,
               (unsigned int) p->uppercase_mapping,
               (unsigned int) p->lowercase_mapping,
               (unsigned int) p->titlecase_mapping,
               p->comb1st_index,
               p->comb2nd_index,
               p->bidi_mirrored,
               p->comp_exclusion,
               p->ignorable,
               p->control_boundary,
               p->boundclass);
    }
    return 0;
}
53 changes: 53 additions & 0 deletions tests.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* Common functions and includes for our test programs. */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>

#include "mojibake.h"

/* Number of the input line currently being processed; the test programs
   increment it and check() prefixes its diagnostics with it. */
size_t lineno = 0;

/* Assert-style helper: when cond is zero, prints "line <lineno>: " followed by
   the printf-style message and a newline to stderr, then exits with status 1.
   Does nothing when cond is non-zero. */
void check(int cond, const char *format, ...)
{
    if (!cond) {
        va_list args;
        fprintf(stderr, "line %zu: ", lineno); /* %zu: lineno is an unsigned size_t */
        va_start(args, format);
        vfprintf(stderr, format, args);
        va_end(args);
        fprintf(stderr, "\n");
        exit(1);
    }
}

/* Returns the index of the first character at or after buf[i] that is not
   whitespace (as classified by isspace); returns i itself when buf[i] is
   already non-whitespace, including the terminating NUL. */
size_t skipspaces(const char *buf, size_t i)
{
    /* isspace is only defined for unsigned char values (and EOF); bytes of
       multi-byte UTF-8 input are negative where plain char is signed */
    while (isspace((unsigned char) buf[i])) ++i;
    return i;
}

/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
   separated by whitespace, and terminated by any character not in
   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
   (NUL-terminated) in dest.  Returns one more than the index of that
   terminating character, i.e. the number of bytes read from buf including
   the terminator itself. */
size_t encode(char *dest, const char *buf)
{
    size_t i = 0, j, d = 0;
    do {
        unsigned int c; /* %x stores through an unsigned int pointer */
        i = skipspaces(buf, i);
        /* isxdigit accepts exactly [0-9a-fA-F]; the cast keeps the argument
           in the unsigned char range that <ctype.h> requires */
        for (j=i; isxdigit((unsigned char) buf[j]); ++j)
            ; /* find end of hex input */
        if (j == i) { /* no codepoint found */
            dest[d] = 0; /* NUL-terminate destination string */
            return i + 1;
        }
        check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
        i = j; /* skip to char after hex input */
        d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
    } while (1);
}

Loading