JuliaStrings · stevengj · Dec 14, 2014 · Dec 8, 2014 · Dec 12, 2014
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,6 @@ bench/bench
 bench/icu
 bench/unistring
 normtest
+graphemetest
+utf8proc_data.c.new
+printproperty
diff --git a/Makefile b/Makefile
@@ -2,6 +2,7 @@
 
 CURL=curl
 RUBY=ruby
+PERL=perl
 MAKE=make
 
 # settings
@@ -24,20 +25,23 @@ all: c-library
 c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
 
 clean:
-	rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
+	rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
 	$(MAKE) -C bench clean
 
 update: utf8proc_data.c.new
+	cp -f utf8proc_data.c.new utf8proc_data.c
 
 # real targets
 
-utf8proc_data.c.new: UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
+utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt
 	$(RUBY) data_generator.rb < UnicodeData.txt > utf8proc_data.c.new
 
 UnicodeData.txt:
-
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 
+GraphemeBreakProperty.txt: # read by data_generator.rb to build the grapheme boundclass table
+	$(CURL) -o $@ http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
+
 DerivedCoreProperties.txt:
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
 
@@ -67,8 +71,18 @@ libmojibake.dylib: utf8proc.o
 NormalizationTest.txt:
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
 
-normtest: normtest.c utf8proc.o mojibake.h
-	$(cc) normtest.c utf8proc.o -o normtest
+GraphemeBreakTest.txt: # break markers rewritten to ASCII '/' and '+' for graphemetest; a failed or empty download fails the rule instead of leaving a bogus target
+	$(CURL) -f http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@.tmp && test -s $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
+
+normtest: normtest.c utf8proc.o mojibake.h tests.h # test programs: compile the first prerequisite (the .c file) and link utf8proc.o
+	$(cc) $< utf8proc.o -o $@
+
+graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
+	$(cc) $< utf8proc.o -o $@
+
+printproperty: printproperty.c utf8proc.o mojibake.h tests.h
+	$(cc) $< utf8proc.o -o $@
 
-check: normtest NormalizationTest.txt
+check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
 	./normtest
+	./graphemetest
diff --git a/data_generator.rb b/data_generator.rb
@@ -75,13 +75,13 @@
   end
 end
 
-$grapheme_extend_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Grapheme_Extend.*?# Total code points:/m]
-$grapheme_extend = []
-$grapheme_extend_list.each_line do |entry|
-  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
-    $1.hex.upto($2.hex) { |e2| $grapheme_extend << e2 }
-  elsif entry =~ /^[0-9A-F]+/
-    $grapheme_extend << $&.hex
+# Map every codepoint listed in GraphemeBreakProperty.txt to its UTF8PROC_BOUNDCLASS_* name; unlisted codepoints default to OTHER.
+$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
+$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
+$grapheme_boundclass_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/ # "XXXX ; Class" or "XXXX..YYYY ; Class"
+    first, last, klass = $1.hex, ($2 || $1).hex, $3.upcase # a single codepoint is a one-element range
+    first.upto(last) { |cp| $grapheme_boundclass[cp] = "UTF8PROC_BOUNDCLASS_" + klass }
   end
 end
 
@@ -161,18 +161,18 @@ def c_entry(comb1_indicies, comb2_indicies)
     "#{str2c bidi_class, 'BIDI_CLASS'}, " <<
     "#{str2c decomp_type, 'DECOMP_TYPE'}, " <<
     "#{ary2c decomp_mapping}, " <<
-    "#{bidi_mirrored}, " <<
+    "#{ary2c case_folding}, " <<
     "#{uppercase_mapping or -1}, " <<
     "#{lowercase_mapping or -1}, " <<
     "#{titlecase_mapping or -1}, " <<
     "#{comb1_indicies[code] ?
        (comb1_indicies[code]*comb2_indicies.keys.length) : -1
       }, #{comb2_indicies[code] or -1}, " <<
+    "#{bidi_mirrored}, " <<
     "#{$exclusions.include?(code) or $excl_version.include?(code)}, " <<
     "#{$ignorable.include?(code)}, " <<
     "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
-    "#{$grapheme_extend.include?(code)}, " <<
-    "#{ary2c case_folding}},\n"
+    "#{$grapheme_boundclass[code]}},\n"
   end
 end
 
@@ -295,7 +295,7 @@ def c_entry(comb1_indicies, comb2_indicies)
 $stdout << "};\n\n"
 
 $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n"
+$stdout << "  {0, 0, 0, 0, NULL, NULL, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER},\n"
 properties.each { |line|
   $stdout << line
 }

diff --git a/graphemetest.c b/graphemetest.c
@@ -0,0 +1,91 @@
+#include "tests.h"
+
+/* Check utf8proc's grapheme segmentation against GraphemeBreakTest.txt.
+ *
+ * Each data line lists hex codepoints separated by '/' (grapheme break) or
+ * '+' (no break) markers.  From it we build the expected string (the
+ * codepoints as UTF-8 with '/' kept at the breaks), run the same text
+ * without markers through utf8proc_map(UTF8PROC_CHARBOUND), rewrite the 0xff
+ * bytes of the result as '/', and compare.  Any failure exits via check(). */
+int main(void)
+{
+    char *buf = NULL;
+    size_t bufsize = 0;
+    FILE *f = fopen("GraphemeBreakTest.txt", "r");
+    uint8_t src[1024];
+    ssize_t linelen;
+
+    check(f != NULL, "error opening GraphemeBreakTest.txt");
+    while ((linelen = getline(&buf, &bufsize, f)) > 0) {
+        size_t bi = 0, si = 0;
+        lineno += 1;
+
+        if (lineno % 100 == 0)
+            printf("checking line %zu...\n", lineno);
+
+        if (buf[0] == '#') continue;
+
+        /* NOTE(review): this bound assumes encode() never writes more bytes than
+           the hex digits it consumes, so the line length limits what lands in
+           src and in utf8 below -- confirm against utf8proc_encode_char */
+        check((size_t) linelen < sizeof(src), "test line too long (%zu bytes)", (size_t) linelen);
+
+        while (buf[bi]) {
+            bi = skipspaces(buf, bi);
+            if (buf[bi] == 0 || buf[bi] == '#') { /* end of line or start of comments */
+                break;
+            }
+            else if (buf[bi] == '/') { /* grapheme break */
+                src[si++] = '/';
+                bi++;
+            }
+            else if (buf[bi] == '+') { /* no break */
+                bi++;
+            }
+            else { /* hex-encoded codepoint */
+                /* encode() returns one more than the offset of the character that stopped it */
+                size_t consumed = encode((char*) (src + si), buf + bi) - 1;
+                /* no progress means buf[bi] is not a hex digit: fail rather than loop forever */
+                check(consumed > 0, "unexpected character '%c' in test line", buf[bi]);
+                bi += consumed;
+                while (src[si]) ++si; /* advance to NUL termination */
+            }
+        }
+        if (si && src[si-1] == '/')
+            --si; /* no break after final grapheme */
+        src[si] = 0; /* NUL-terminate */
+
+        if (si) {
+            uint8_t utf8[1024]; /* copy of src without the '/' grapheme separators */
+            size_t i = 0, j = 0;
+            ssize_t glen;
+            uint8_t *g = NULL; /* utf8proc_map output; NULL keeps free(g) below safe if nothing is stored */
+            while (i < si) {
+                if (src[i] != '/')
+                    utf8[j++] = src[i++];
+                else
+                    i++;
+            }
+            glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+            if (glen == UTF8PROC_ERROR_INVALIDUTF8) {
+                 /* the test file contains surrogate codepoints, which are only for UTF-16 */
+                 printf("line %zu: ignoring invalid UTF-8 codepoints\n", lineno);
+            }
+            else {
+                 check(glen >= 0, "utf8proc_map error = %s",
+                       utf8proc_errmsg(glen));
+                 for (i = 0; i <= (size_t) glen; ++i)
+                      if (g[i] == 0xff)
+                           g[i] = '/'; /* easier-to-read output (/ is not in test strings) */
+                 printf("line %zu\n", lineno);
+                 check(!strcmp((char*)g, (char*)src),
+                       "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src);
+            }
+            free(g);
+        }
+    }
+    fclose(f);
+    free(buf); /* line buffer allocated by getline() */
+    printf("Passed tests after %zu lines!\n", lineno);
+    return 0;
+}
diff --git a/mojibake.h b/mojibake.h
@@ -170,17 +170,17 @@ typedef struct utf8proc_property_struct {
   utf8proc_propval_t bidi_class;
   utf8proc_propval_t decomp_type;
   const int32_t *decomp_mapping;
-  unsigned bidi_mirrored:1;
+  const int32_t *casefold_mapping;
   int32_t uppercase_mapping;
   int32_t lowercase_mapping;
   int32_t titlecase_mapping;
   int32_t comb1st_index;
   int32_t comb2nd_index;
+  unsigned bidi_mirrored:1;
   unsigned comp_exclusion:1;
   unsigned ignorable:1;
   unsigned control_boundary:1;
-  unsigned extend:1;
-  const int32_t *casefold_mapping;
+  unsigned boundclass:4;
 } utf8proc_property_t;
 
 #define UTF8PROC_CATEGORY_LU  1
@@ -253,6 +253,21 @@ typedef struct utf8proc_property_struct {
 #define UTF8PROC_DECOMP_TYPE_FRACTION 15
 #define UTF8PROC_DECOMP_TYPE_COMPAT   16
 
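+/* values for the boundclass property; data_generator.rb emits these names as UTF8PROC_BOUNDCLASS_<upcased class name from GraphemeBreakProperty.txt>, and each value must fit in the 4-bit boundclass bit-field: */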
+#define UTF8PROC_BOUNDCLASS_START    0
+#define UTF8PROC_BOUNDCLASS_OTHER    1
+#define UTF8PROC_BOUNDCLASS_CR       2
+#define UTF8PROC_BOUNDCLASS_LF       3
+#define UTF8PROC_BOUNDCLASS_CONTROL  4
+#define UTF8PROC_BOUNDCLASS_EXTEND   5
+#define UTF8PROC_BOUNDCLASS_L        6
+#define UTF8PROC_BOUNDCLASS_V        7
+#define UTF8PROC_BOUNDCLASS_T        8
+#define UTF8PROC_BOUNDCLASS_LV       9
+#define UTF8PROC_BOUNDCLASS_LVT     10
+#define UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 11
+#define UTF8PROC_BOUNDCLASS_SPACINGMARK 12
+
 DLLEXPORT extern const int8_t utf8proc_utf8class[256];
 
 DLLEXPORT const char *utf8proc_version(void);
@@ -367,6 +382,12 @@ DLLEXPORT ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options
  *           crash!
  */
 
+DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2);
+/*
+ * Given a pair of consecutive codepoints (c1,c2), return whether a grapheme break is
+ * permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ */
+
 DLLEXPORT ssize_t utf8proc_map(
   const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
 );

diff --git a/normtest.c b/normtest.c
@@ -1,47 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "mojibake.h"
-
-size_t lineno = 0;
-
-void check(int cond, const char *format, ...)
-{
-     if (!cond) {
-          va_list args;
-          fprintf(stderr, "line %zd: ", lineno);
-          va_start(args, format);
-          vfprintf(stderr, format, args);
-          va_end(args);
-          fprintf(stderr, "\n");
-          exit(1);
-     }
-}
-
-/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
-   separated by whitespace, and terminated by any character not in
-   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
-   in dest, returning the number of bytes read from buf */
-size_t encode(char *dest, const char *buf)
-{
-     size_t i = 0, j, d = 0;
-     do {
-          int c;
-          while (isspace(buf[i])) ++i; /* skip whitespace */
-          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
-               ; /* find end of hex input */
-          if (j == i) { /* no codepoint found */
-               dest[d] = 0; /* NUL-terminate destination string */
-               return i + 1;
-          }
-          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
-          i = j; /* skip to char after hex input */
-          d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
-     } while (1);
-}
+#include "tests.h"
 
 #define CHECK_NORM(NRM, norm, src) {                                 \
     char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src);      \

diff --git a/printproperty.c b/printproperty.c
@@ -0,0 +1,51 @@
+/* simple test program to print out the utf8proc properties for a codepoint */
+
+#include "tests.h"
+
+/* Usage: printproperty HEX [HEX ...]
+ *
+ * Parses each argument as a hexadecimal codepoint, looks it up with
+ * utf8proc_get_property(), and prints the numeric fields of the result.
+ * An argument that is not hex makes check() exit with status 1. */
+int main(int argc, char **argv)
+{
+     int i;
+
+     for (i = 1; i < argc; ++i) {
+          unsigned int c; /* sscanf's %x stores through an unsigned int pointer */
+          const utf8proc_property_t *p;
+          check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]);
+          p = utf8proc_get_property((int32_t) c);
+          printf("U+%s:\n"
+                 "  category = %d\n"
+                 "  combining_class = %d\n"
+                 "  bidi_class = %d\n"
+                 "  decomp_type = %d\n"
+                 "  uppercase_mapping = %x\n"
+                 "  lowercase_mapping = %x\n"
+                 "  titlecase_mapping = %x\n"
+                 "  comb1st_index = %d\n"
+                 "  comb2nd_index = %d\n"
+                 "  bidi_mirrored = %d\n"
+                 "  comp_exclusion = %d\n"
+                 "  ignorable = %d\n"
+                 "  control_boundary = %d\n"
+                 "  boundclass = %d\n",
+                 argv[i],
+                 p->category,
+                 p->combining_class,
+                 p->bidi_class,
+                 p->decomp_type,
+                 (unsigned int) p->uppercase_mapping, /* mappings are int32_t but %x expects unsigned int */
+                 (unsigned int) p->lowercase_mapping,
+                 (unsigned int) p->titlecase_mapping,
+                 p->comb1st_index,
+                 p->comb2nd_index,
+                 p->bidi_mirrored,
+                 p->comp_exclusion,
+                 p->ignorable,
+                 p->control_boundary,
+                 p->boundclass);
+     }
+     return 0;
+}
diff --git a/tests.h b/tests.h
@@ -0,0 +1,61 @@
+/* Common functions and includes for our test programs. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "mojibake.h"
+
+/* line number of the input being processed; check() prints it in front of each error */
+size_t lineno = 0;
+
+/* if cond is zero, print "line <lineno>: " followed by the printf-style
+   message to stderr and terminate the program with exit status 1 */
+void check(int cond, const char *format, ...)
+{
+     if (!cond) {
+          va_list args;
+          fprintf(stderr, "line %zu: ", lineno); /* lineno is a size_t, hence %zu */
+          va_start(args, format);
+          vfprintf(stderr, format, args);
+          va_end(args);
+          fprintf(stderr, "\n");
+          exit(1);
+     }
+}
+
+/* return the index of the first non-whitespace character of buf at or
+   after index i (possibly the terminating NUL) */
+size_t skipspaces(const char *buf, size_t i)
+{
+    /* <ctype.h> functions are undefined for negative arguments other than EOF,
+       so convert a possibly-signed char to unsigned char first */
+    while (isspace((unsigned char) buf[i])) ++i;
+    return i;
+}
+
+/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
+   in dest, returning the number of bytes read from buf including the
+   terminating character (i.e. one more than that character's offset) */
+size_t encode(char *dest, const char *buf)
+{
+     size_t i = 0, j, d = 0;
+     do {
+          unsigned int c; /* sscanf's %x stores through an unsigned int pointer */
+          i = skipspaces(buf, i);
+          for (j=i; buf[j] && strchr("0123456789abcdef", tolower((unsigned char) buf[j])); ++j)
+               ; /* find end of hex input */
+          if (j == i) { /* no codepoint found */
+               dest[d] = 0; /* NUL-terminate destination string */
+               return i + 1;
+          }
+          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
+          i = j; /* skip to char after hex input */
+          d += utf8proc_encode_char((int32_t) c, (uint8_t *) (dest + d));
+     } while (1);
+}
+