Skip to content

Commit

Permalink
charwidth=1 for soft hyphen and unassigned codepoints (#135)
Browse files Browse the repository at this point in the history
* use width=1 for soft hyphen and for unassigned/PUA codepoints

* don't count unassigned codepoints when comparing with system wcwidth

* more tests

* indentation fixes

* NEWS for 135

* remove special-casing for Arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not

* regenerate
  • Loading branch information
stevengj authored Jul 24, 2018
1 parent 0975bf9 commit 02f4e18
Show file tree
Hide file tree
Showing 5 changed files with 1,987 additions and 1,972 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
- `toupper` of ß (U+00df) now yields ẞ (U+1E9E) ([#134]), similar to musl;
case-folding still yields the standard "ss" mapping.

- `utf8proc_charwidth` now returns `1` for U+00AD (soft hyphen) and
for unassigned/PUA codepoints ([#135]).

## Version 2.1.1 ##

2018-04-27
Expand Down Expand Up @@ -336,3 +339,4 @@ Release of version 1.0.1
[#132]: https://github.com/JuliaLang/utf8proc/issues/132
[#133]: https://github.com/JuliaLang/utf8proc/issues/133
[#134]: https://github.com/JuliaLang/utf8proc/issues/134
[#135]: https://github.com/JuliaLang/utf8proc/issues/135
29 changes: 17 additions & 12 deletions data/charwidths.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ import Base.UTF8proc

#############################################################################
# Use a default width of 1 for all character categories that are
# letter/symbol/number-like. This can be overridden by Unifont or UAX 11
# letter/symbol/number-like, as well as for unassigned/private-use chars.
# This can be overridden by Unifont or UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.

zerowidth = Set{Int}() # categories that may contain zero-width chars
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
Expand All @@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
for c in 0x0000:0x110000
if catcode(c) ∉ zerowidth
CharWidths[c] = 1
Expand Down Expand Up @@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
elseif width=="Na"|| width=="H" # narrow or half
elseif width=="Na"|| width=="H"
CharWidths[c]=1
end
end
Expand All @@ -115,9 +114,11 @@ end
for c in keys(CharWidths)
cat = catcode(c)

# make sure format control characters (category Cf) have width 0,
# except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
# make sure format control characters (category Cf) have width 0
# (some of these, like U+0601, can have a width in some cases
# but normally act like prepended combining marks. U+fff9 etc
# are also odd, but have zero width in typical terminal contexts)
if cat==UTF8proc.UTF8PROC_CATEGORY_CF
CharWidths[c]=0
end

Expand All @@ -128,11 +129,12 @@ for c in keys(CharWidths)
CharWidths[c]=0
end

# We also assign width of zero to unassigned and private-use
# We also assign width of one to unassigned and private-use
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
# but since these are nonstandard it seems questionable to recognize them).
# but since these are nonstandard it seems questionable to use Unifont metrics;
# if they are printed as the replacement character U+FFFD they will have width 1).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
CharWidths[c]=0
CharWidths[c]=1
end

# for some reason, Unifont has width-2 glyphs for ASCII control chars
Expand All @@ -141,6 +143,9 @@ for c in keys(CharWidths)
end
end

#Soft hyphen is typically printed as a hyphen (-) in terminals.
CharWidths[0x00ad]=1

#By definition, should have zero width (on the same line)
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
Expand All @@ -158,8 +163,8 @@ CharWidths[0x2001]=2
CharWidths[0x2003]=2

#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
# ... don't bother to output zero widths since that will be the default.
# Output (to a file or pipe) for processing by data_generator.rb,
# encoded as a sequence of intervals.

firstc = 0x000000
lastv = 0
Expand Down
2 changes: 1 addition & 1 deletion data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def c_entry(comb_indicies)
$stdout << "};\n\n"

$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 0, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
properties.each { |line|
$stdout << line
}
Expand Down
120 changes: 63 additions & 57 deletions test/charwidth.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,70 +2,76 @@
#include <ctype.h>
#include <wchar.h>

/* Return 1 when codepoint c has general category Cn (unassigned) or
 * Co (private use) according to utf8proc's property table, else 0. */
static int my_unassigned(int c) {
    const int category = utf8proc_get_property(c)->category;
    if (category == UTF8PROC_CATEGORY_CN)
        return 1;
    return category == UTF8PROC_CATEGORY_CO;
}

static int my_isprint(int c) {
int cat = utf8proc_get_property(c)->category;
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd);
int cat = utf8proc_get_property(c)->category;
return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
(c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
(cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
}

int main(int argc, char **argv)
{
int c, error = 0, updates = 0;
int c, error = 0, updates = 0;

(void) argc; /* unused */
(void) argv; /* unused */

(void) argc; /* unused */
(void) argv; /* unused */
/* some simple sanity tests of the character widths */
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error += 1;
}
if (w == 0 &&
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
fprintf(stderr, "zero width for symbol-like char %x\n", c);
error += 1;
}
if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
wcwidth(c), w,
isprint(c) ? "printable" : "non-printable", c);
error += 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error += 1;
}
if (my_unassigned(c) && w != 1) {
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
error += 1;
}
}
check(!error, "utf8proc_charwidth FAILED %d tests.", error);

/* some simple sanity tests of the character widths */
for (c = 0; c <= 0x110000; ++c) {
int cat = utf8proc_get_property(c)->category;
int w = utf8proc_charwidth(c);
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) &&
w > 0) {
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
error = 1;
}
if (w == 0 &&
((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
(cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
(cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
fprintf(stderr, "zero width for symbol-like char %x\n", c);
error = 1;
}
if (c <= 127 && ((!isprint(c) && w > 0) ||
(isprint(c) && wcwidth(c) != w))) {
fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
wcwidth(c), w,
isprint(c) ? "printable" : "non-printable", c);
error = 1;
}
if (!my_isprint(c) && w > 0) {
fprintf(stderr, "non-printing %x had width %d\n", c, w);
error = 1;
}
}
check(!error, "utf8proc_charwidth FAILED tests.");
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");

/* print some other information by comparing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");
for (c = 0; c <= 0x110000; ++c) {
int w = utf8proc_charwidth(c);
int wc = wcwidth(c);
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
/* lots of these errors for out-of-date system unicode tables */
if (wc == -1 && my_isprint(c) && w > 0) {
updates += 1;
#if 0
printf(" wcwidth(%x) = -1 for printable char\n", c);
#endif
}
if (wc == -1 && !my_isprint(c) && w > 0)
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
if (wc >= 0 && wc != w)
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
}
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n",
updates);
printf("Character-width tests SUCCEEDED.\n");
/* print some other information by comparing with system wcwidth */
printf("Mismatches with system wcwidth (not necessarily errors):\n");
for (c = 0; c <= 0x110000; ++c) {
int w = utf8proc_charwidth(c);
int wc = wcwidth(c);
if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
/* lots of these errors for out-of-date system unicode tables */
if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
updates += 1;
if (wc == -1 && !my_isprint(c) && w > 0)
printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
if (wc >= 0 && wc != w)
printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
}
printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
printf("Character-width tests SUCCEEDED.\n");

return 0;
return 0;
}
Loading

0 comments on commit 02f4e18

Please sign in to comment.