Skip to content

Commit

Permalink
add toupper/tolower functions (for JuliaLang/julia#11471)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed May 29, 2015
1 parent 7c14ef5 commit 7e53895
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ graphemetest
utf8proc_data.c.new
printproperty
charwidth
case
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,12 @@ test/printproperty: test/printproperty.c utf8proc.o utf8proc.h test/tests.h
test/charwidth: test/charwidth.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/charwidth.c utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth bench/bench.c bench/util.c bench/util.h utf8proc.o
# Build the test program that exercises utf8proc_tolower/utf8proc_toupper.
test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
$(cc) test/case.c utf8proc.o -o $@

# Run the test suite: invoke make in bench/, then run each test program
# listed in the recipe.  test/printproperty is a prerequisite (so it gets
# built) but it is not executed here.
check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/case bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
test/charwidth
test/case
50 changes: 50 additions & 0 deletions test/case.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include "tests.h"
#include <wctype.h>

/* Test driver for utf8proc_tolower/utf8proc_toupper.
 *
 * For every value from 0 through 0x110000 inclusive it checks that each
 * mapping either returns its argument unchanged or returns a codepoint
 * accepted by utf8proc_codepoint_valid.  Where wint_t is wide enough, the
 * results are also compared with the C library's towlower/towupper:
 *   - the C library maps the codepoint, but to a different value than
 *     utf8proc does: counted as an error;
 *   - the C library leaves the codepoint unchanged while utf8proc maps it:
 *     counted as "better" (utf8proc is presumed to be more up to date).
 *
 * Returns 0 on success; failures are reported through check().
 */
int main(int argc, char **argv)
{
    int error = 0, better = 0;
    utf8proc_int32_t c;

    (void) argc; /* unused */
    (void) argv; /* unused */

    /* some simple sanity tests of the case mappings */
    for (c = 0; c <= 0x110000; ++c) {
        utf8proc_int32_t l = utf8proc_tolower(c);
        utf8proc_int32_t u = utf8proc_toupper(c);

        check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
        check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");

        /* Only consult the C library when c fits in wint_t: with a
           2-byte wint_t, values of 1<<16 and above are skipped. */
        if (sizeof(wint_t) > 2 || c < (1<<16)) {
            wint_t l0 = towlower((wint_t) c), u0 = towupper((wint_t) c);

            /* OS unicode tables may be out of date.  But if they
               do have a lower/uppercase mapping, hopefully it
               is correct? */
            if (l0 != c && l0 != l) {
                /* %x requires unsigned int, so convert explicitly;
                   wint_t's underlying type varies between platforms. */
                fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
                        (unsigned) l, (unsigned) c, (unsigned) l0);
                ++error;
            }
            else if (l0 != l) { /* often true for out-of-date OS unicode */
                ++better;
            }
            if (u0 != c && u0 != u) {
                fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
                        (unsigned) u, (unsigned) c, (unsigned) u0);
                ++error;
            }
            else if (u0 != u) { /* often true for out-of-date OS unicode */
                ++better;
            }
        }
    }
    check(!error, "utf8proc case conversion FAILED %d tests.", error);
    printf("More up-to-date than OS unicode tables for %d tests.\n", better);
    printf("utf8proc case conversion tests SUCCEEDED.\n");
    return 0;
}
12 changes: 12 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut
utf8proc_get_property(c2)->boundclass);
}

/* Return the lower-case mapping recorded in c's property entry.  A negative
   lowercase_mapping is treated as "no mapping", in which case c itself is
   returned. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  utf8proc_int32_t lower = utf8proc_get_property(c)->lowercase_mapping;

  if (lower < 0) {
    return c;
  }
  return lower;
}

/* Return the upper-case mapping recorded in c's property entry.  A negative
   uppercase_mapping is treated as "no mapping", in which case c itself is
   returned. */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  utf8proc_int32_t upper = utf8proc_get_property(c)->uppercase_mapping;

  if (upper < 0) {
    return c;
  }
  return upper;
}

/* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
Expand Down
15 changes: 15 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
*/
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);


/**
* Given a codepoint `c`, return the codepoint of the corresponding
* lower-case character, if any; otherwise (if there is no lower-case
* variant, or if `c` is not a valid codepoint) return `c`.
*
* The result is always a single codepoint.
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);

/**
* Given a codepoint `c`, return the codepoint of the corresponding
* upper-case character, if any; otherwise (if there is no upper-case
* variant, or if `c` is not a valid codepoint) return `c`.
*
* The result is always a single codepoint.
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);

/**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
Expand Down

0 comments on commit 7e53895

Please sign in to comment.