-
Notifications
You must be signed in to change notification settings - Fork 58
/
wcwidth-libc-comparator.py
executable file
·138 lines (106 loc) · 3.79 KB
/
wcwidth-libc-comparator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
# coding: utf-8
"""
Manual tests comparing wcwidth.py to libc's wcwidth(3) and wcswidth(3).
https://github.com/jquast/wcwidth
This suite of tests compares the libc return values with the pure-python return
values. Although wcwidth(3) is POSIX, its actual implementation may differ,
so these tests are not guaranteed to be successful on all platforms, especially
where wcwidth(3)/wcswidth(3) is out of date. This is especially true for many
platforms -- usually conforming only to unicode specification 1.0 or 2.0.
This program accepts one optional command-line argument, the unicode version
level for our library to use when comparing to libc.
"""
# pylint: disable=C0103
# Invalid module name "wcwidth-libc-comparator"
# standard imports
from __future__ import print_function
# std imports
import sys
import locale
import warnings
import ctypes.util
import unicodedata
# local
# local imports
import wcwidth
def is_named(ucs):
    """
    Tell whether the unicode point ``ucs`` has a name.

    :param ucs: a single unicode character.
    :rtype: bool
    :returns: True when ``unicodedata.name`` yields a non-empty name for
        ``ucs``, False when it raises ValueError.
    """
    try:
        char_name = unicodedata.name(ucs)
    except ValueError:
        # unicodedata.name() raises ValueError when no name is on record.
        return False
    return bool(char_name)
def is_not_combining(ucs):
    """
    Tell whether the unicode point ``ucs`` is *not* a combining character.

    :param ucs: a single unicode character.
    :rtype: bool
    :returns: True when ``unicodedata.combining`` reports class 0 for
        ``ucs``, False for any non-zero combining class.
    """
    combining_class = unicodedata.combining(ucs)
    return combining_class == 0
def report_ucs_msg(ucs, wcwidth_libc, wcwidth_local):
    """
    Return string report of a character width difference.

    :param ucs: unicode point (a single character).
    :type ucs: unicode
    :param wcwidth_libc: libc-wcwidth's reported character length.
    :type wcwidth_libc: int
    :param wcwidth_local: wcwidth's reported character length.
    :type wcwidth_local: int
    :raises ValueError: if ``unicodedata`` has no name for ``ucs``.
    :rtype: unicode
    """
    # Format the code point from ord() instead of slicing the
    # 'unicode_escape' encoding: that encoding only carries a two-byte
    # prefix (\x, \u, \U) for non-ASCII characters, so slicing it turned
    # printable ASCII such as u'a' into an empty code point ("U+").
    ucp = "{:X}".format(ord(ucs))
    url = "http://codepoints.net/U+{}".format(ucp)
    name = unicodedata.name(ucs)
    return (u"libc,ours={},{} [--o{}o--] name={} val={} {}"
            " ".format(wcwidth_libc, wcwidth_local, ucs, name, ord(ucs), url))
# Python 3 has no unichr(); probe for it and alias chr() under that name
# so the rest of the module can call unichr() on either major version.
try:
    _ = unichr(0)
except NameError as exc:
    if exc.args[0] != "name 'unichr' is not defined":
        # A NameError about anything else is not ours to handle.
        raise
    # pylint: disable=W0622
    # Redefining built-in 'unichr'
    unichr = chr

# 0x10FFFF (1114111) is the highest unicode code point; a smaller
# sys.maxunicode means this interpreter cannot represent all of them.
if sys.maxunicode < 0x10FFFF:
    warnings.warn('narrow Python build, only a small subset of '
                  'characters may be tested.')
def _is_equal_wcwidth(libc, ucs, unicode_version):
    """
    Verify that libc and the local wcwidth agree on the width of ``ucs``.

    :param libc: loaded C library object exposing a ``wcwidth`` callable.
    :param ucs: unicode point (a single character) to measure.
    :param unicode_version: passed as the second argument to
        ``wcwidth.wcwidth``.
    :raises AssertionError: when the two widths differ; the message is the
        report built by ``report_ucs_msg``.
    :returns: None
    """
    w_libc = libc.wcwidth(ucs)
    w_local = wcwidth.wcwidth(ucs, unicode_version)
    if w_libc != w_local:
        # Raise explicitly rather than via ``assert``: assert statements are
        # stripped under ``python -O``, which would make every comparison
        # silently pass.
        raise AssertionError(report_ucs_msg(ucs, w_libc, w_local))
def main(using_locale=('en_US', 'UTF-8',)):
    """
    Program entry point.

    Walk the entire Unicode table, excluding those code points that:

        - are not named (func unicodedata.name raises ValueError),
        - are combining characters.

    Using ``locale``, for each unicode character string compare libc's
    wcwidth with local wcwidth.wcwidth() function; when they differ,
    report a detailed AssertionError to stdout.

    The first command-line argument, when present, selects the unicode
    version handed to wcwidth.wcwidth(); otherwise 'latest' is used.

    :param using_locale: value passed to ``locale.setlocale`` for LC_ALL.
    :raises ImportError: when the C library cannot be found, or it does not
        provide ``wcwidth`` and ``wcswidth``.
    """
    libc_name = ctypes.util.find_library('c')
    if not libc_name:
        raise ImportError("Can't find C library.")
    libc = ctypes.cdll.LoadLibrary(libc_name)

    # Check for the required symbols *before* touching libc.wcwidth, and
    # without ``assert`` so the check survives ``python -O``.
    for func_name in ('wcwidth', 'wcswidth'):
        if getattr(libc, func_name, None) is None:
            raise ImportError(
                "C library {} does not provide {}()."
                .format(libc_name, func_name))
    libc.wcwidth.argtypes = [ctypes.c_wchar, ]
    libc.wcwidth.restype = ctypes.c_int

    locale.setlocale(locale.LC_ALL, using_locale)

    unicode_version = sys.argv[1] if len(sys.argv) > 1 else 'latest'

    # sys.maxunicode is itself a valid code point, hence the "+ 1".  The
    # characters are produced lazily instead of building a list of every
    # code point up front.
    all_ucs = (ucs for ucs in
               (unichr(val) for val in range(sys.maxunicode + 1))
               if is_named(ucs) and is_not_combining(ucs))

    for ucs in all_ucs:
        try:
            _is_equal_wcwidth(libc, ucs, unicode_version)
        except AssertionError as err:
            # A mismatch is reported, not fatal: keep scanning the table.
            print(err)
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()