charwidth=1 for soft hyphen and unassigned codepoints (#135)

* use width=1 for soft hyphen and for unassigned/PUA codepoints
* don't count unassigned codepoints when comparing with system wcwidth
* more tests
* indentation fixes
* NEWS for 135
* remove special-casing for arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not
* regenerate
JuliaStrings · Jul 24, 2018 · 02f4e18 · 02f4e18
1 parent 0975bf9
commit 02f4e18
Show file tree

Hide file tree

Showing 5 changed files with 1,987 additions and 1,972 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -17,6 +17,9 @@
 - `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
   case-folding still yields the standard "ss" mapping.
 
+- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
+  for unassigned/PUA codepoints ([#135]).
+
 ## Version 2.1.1 ##
 
 2018-04-27
@@ -336,3 +339,4 @@ Release of version 1.0.1
 [#132]: https://github.com/JuliaLang/utf8proc/issues/132
 [#133]: https://github.com/JuliaLang/utf8proc/issues/133
 [#134]: https://github.com/JuliaLang/utf8proc/issues/134
+[#135]: https://github.com/JuliaLang/utf8proc/issues/135
diff --git a/data/charwidths.jl b/data/charwidths.jl
@@ -20,12 +20,12 @@ import Base.UTF8proc
 
 #############################################################################
 # Use a default width of 1 for all character categories that are
-# letter/symbol/number-like.  This can be overriden by Unifont or UAX 11
+# letter/symbol/number-like, as well as for unassigned/private-use chars.
+# This can be overridden by Unifont or UAX 11
 # below, but provides a useful nonzero fallback for new codepoints when
 # a new Unicode version has been released but Unifont hasn't been updated yet.
 
 zerowidth = Set{Int}() # categories that may contain zero-width chars
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
@@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
 push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
 for c in 0x0000:0x110000
     if catcode(c) ∉ zerowidth
         CharWidths[c] = 1
@@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
     for c in charstart:charend
         if width=="W" || width=="F" # wide or full
             CharWidths[c]=2
-        elseif width=="Na"|| width=="H" # narrow or half
+        elseif width=="Na"|| width=="H"
             CharWidths[c]=1
         end
     end
@@ -115,9 +114,11 @@ end
 for c in keys(CharWidths)
     cat = catcode(c)
 
-    # make sure format control character (category Cf) have width 0,
-    # except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
-    if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
+    # make sure format control characters (category Cf) have width 0
+    # (some of these, like U+0601, can have a width in some cases
+    #  but normally act like prepended combining marks.  U+fff9 etc
+    #  are also odd, but have zero width in typical terminal contexts)
+    if cat==UTF8proc.UTF8PROC_CATEGORY_CF
         CharWidths[c]=0
     end
 
@@ -128,11 +129,12 @@ for c in keys(CharWidths)
         CharWidths[c]=0
     end
 
-    # We also assign width of zero to unassigned and private-use
+    # We also assign width of one to unassigned and private-use
     # codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
-    # but since these are nonstandard it seems questionable to recognize them).
+    # but since these are nonstandard it seems questionable to use Unifont metrics;
+    # if they are printed as the replacement character U+FFFD they will have width 1).
     if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
-        CharWidths[c]=0
+        CharWidths[c]=1
     end
 
     # for some reason, Unifont has width-2 glyphs for ASCII control chars
@@ -141,6 +143,9 @@ for c in keys(CharWidths)
     end
 end
 
+#Soft hyphen is typically printed as a hyphen (-) in terminals.
+CharWidths[0x00ad]=1
+
 #By definition, should have zero width (on the same line)
 #0x002028 ' ' category: Zl name: LINE SEPARATOR/
 #0x002029 ' ' category: Zp name: PARAGRAPH SEPARATOR/
@@ -158,8 +163,8 @@ CharWidths[0x2001]=2
 CharWidths[0x2003]=2
 
 #############################################################################
-# Output (to a file or pipe) for processing by data_generator.rb
-# ... don't bother to output zero widths since that will be the default.
+# Output (to a file or pipe) for processing by data_generator.rb,
+# encoded as a sequence of intervals.
 
 firstc = 0x000000
 lastv = 0

diff --git a/data/data_generator.rb b/data/data_generator.rb
@@ -378,7 +378,7 @@ def c_entry(comb_indicies)
 $stdout << "};\n\n"
 
 $stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
-$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
+$stdout << "  {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
 properties.each { |line|
   $stdout << line
 }

diff --git a/test/charwidth.c b/test/charwidth.c
@@ -2,70 +2,76 @@
 #include <ctype.h>
 #include <wchar.h>
 
+static int my_unassigned(int c) {
+    int cat = utf8proc_get_property(c)->category;
+    return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
+}
+
 static int my_isprint(int c) {
-     int cat = utf8proc_get_property(c)->category;
-     return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
-          (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
+    int cat = utf8proc_get_property(c)->category;
+    return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
+           (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
+           (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
 }
 
 int main(int argc, char **argv)
 {
-     int c, error = 0, updates = 0;
+    int c, error = 0, updates = 0;
+
+    (void) argc; /* unused */
+    (void) argv; /* unused */
 
-     (void) argc; /* unused */
-     (void) argv; /* unused */
+    /* some simple sanity tests of the character widths */
+    for (c = 0; c <= 0x110000; ++c) {
+        int cat = utf8proc_get_property(c)->category;
+        int w = utf8proc_charwidth(c);
+        if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
+            fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
+            error += 1;
+        }
+        if (w == 0 &&
+            ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
+             (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
+             (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
+            fprintf(stderr, "zero width for symbol-like char %x\n", c);
+            error += 1;
+        }
+        if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
+            fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
+            wcwidth(c), w,
+            isprint(c) ? "printable" : "non-printable", c);
+            error += 1;
+        }
+        if (!my_isprint(c) && w > 0) {
+            fprintf(stderr, "non-printing %x had width %d\n", c, w);
+            error += 1;
+        }
+        if (my_unassigned(c) && w != 1) {
+            fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
+            error += 1;
+        }
+    }
+    check(!error, "utf8proc_charwidth FAILED %d tests.", error);
 
-     /* some simple sanity tests of the character widths */
-     for (c = 0; c <= 0x110000; ++c) {
-          int cat = utf8proc_get_property(c)->category;
-          int w = utf8proc_charwidth(c);
-          if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
-              w > 0) {
-               fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
-               error = 1;
-          }
-          if (w == 0 &&
-			  ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
-			   (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
-			   (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
-               fprintf(stderr, "zero width for symbol-like char %x\n", c);
-               error = 1;
-          }
-          if (c <= 127 && ((!isprint(c) && w > 0) ||
-                           (isprint(c) && wcwidth(c) != w))) {
-               fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
-                       wcwidth(c), w, 
-                       isprint(c) ? "printable" : "non-printable", c);
-               error = 1;
-          }
-          if (!my_isprint(c) && w > 0) {
-               fprintf(stderr, "non-printing %x had width %d\n", c, w);
-               error = 1;
-          }
-     }
-     check(!error, "utf8proc_charwidth FAILED tests.");
+    check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
+    check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
 
-     /* print some other information by compariing with system wcwidth */
-     printf("Mismatches with system wcwidth (not necessarily errors):\n");
-     for (c = 0; c <= 0x110000; ++c) {
-          int w = utf8proc_charwidth(c);
-          int wc = wcwidth(c);
-          if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
-          /* lots of these errors for out-of-date system unicode tables */
-          if (wc == -1 && my_isprint(c) && w > 0) {
-			   updates += 1;
-#if 0
-               printf("  wcwidth(%x) = -1 for printable char\n", c);
-#endif
-		  }
-          if (wc == -1 && !my_isprint(c) && w > 0)
-               printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
-          if (wc >= 0 && wc != w)
-               printf("  wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
-     }
-	 printf("   ... (positive widths for %d chars unknown to wcwidth) ...\n",
-			updates);
-     printf("Character-width tests SUCCEEDED.\n");
+    /* print some other information by comparing with system wcwidth */
+    printf("Mismatches with system wcwidth (not necessarily errors):\n");
+    for (c = 0; c <= 0x110000; ++c) {
+        int w = utf8proc_charwidth(c);
+        int wc = wcwidth(c);
+        if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
+        /* lots of these errors for out-of-date system unicode tables */
+        if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
+            updates += 1;
+        if (wc == -1 && !my_isprint(c) && w > 0)
+            printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
+        if (wc >= 0 && wc != w)
+            printf("  wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
+    }
+    printf("   ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
+    printf("Character-width tests SUCCEEDED.\n");
 
-     return 0;
+    return 0;
 }