-
Notifications
You must be signed in to change notification settings - Fork 141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: charwidth function #27
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Unicode data generation rules. Except for the test data files, most | ||
# users will not use these Makefile rules, which are primarily to re-generate | ||
# unicode_data.c when we get a new Unicode version or charwidth data; they | ||
# require ruby, fontforge, and julia to be installed. | ||
|
||
# programs (each may be overridden on the command line, e.g. `make JULIA=julia-0.3`)
#
# MAKE is deliberately not assigned here: GNU make predefines it, and
# overriding it with a literal `make` breaks propagation of -j/-n to sub-makes.
CURL=curl
RUBY=ruby
PERL=perl
JULIA=julia
# --fail makes curl exit non-zero on HTTP errors (e.g. 404) instead of saving
# the server's error page under the target name with a successful exit status.
CURLFLAGS = --retry 5 --location --fail

# use JuliaLang caching (https://github.com/staticfloat/cache.julialang.org)
# so that Travis builds do not depend on anyone's flaky servers but our own
URLCACHE=https://cache.e.ip.saba.us/

.PHONY: clean

# Remove a target whose recipe failed, so a partial file is never mistaken
# for an up-to-date one.
.DELETE_ON_ERROR:
|
||
# Regenerate the C data tables. The generator script (first prerequisite) is
# fed UnicodeData.txt on stdin and its stdout becomes the target; the other
# data files are listed so that a change in any of them triggers a rebuild
# (presumably the script opens them itself -- confirm in data_generator.rb).
utf8proc_data.c.new: data_generator.rb UnicodeData.txt \
    GraphemeBreakProperty.txt DerivedCoreProperties.txt \
    CompositionExclusions.txt CaseFolding.txt CharWidths.txt
	$(RUBY) $< < UnicodeData.txt > $@
|
||
# GNU Unifont version for font-metric calculations:
UNIFONT_VERSION=7.0.06

# Both font files are published in the same upstream directory under their
# own names, so a single recipe keyed on $@ serves the two (independent)
# targets.
unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://unifoundry.com/pub/unifont-$(UNIFONT_VERSION)/font-builds/$@
|
||
# Character-width table: run the Julia script (first prerequisite) with the
# Unifont version passed through the environment, capturing its stdout.
CharWidths.txt: charwidths.jl \
    unifont-$(UNIFONT_VERSION).ttf unifont_upper-$(UNIFONT_VERSION).ttf \
    EastAsianWidth.txt
	UNIFONT_VERSION=$(UNIFONT_VERSION) $(JULIA) $< > $@
|
||
# Data files fetched verbatim from the UNIDATA directory; the remote file name
# equals the target name, so one recipe serves every (independent) target.
# Only `-o $@` is passed: combining it with `-O` supplies two output options
# for a single URL.
UNIDATA_FILES = UnicodeData.txt EastAsianWidth.txt DerivedCoreProperties.txt \
    CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt

$(UNIDATA_FILES):
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UNIDATA/$@

GraphemeBreakProperty.txt:
	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt

# The test file is post-processed by perl (÷ becomes /, × becomes +).
# Download to a scratch file first rather than piping curl into perl: in a
# pipeline the exit status is perl's, so a failed download would leave an
# empty or truncated target that looks up to date.
GraphemeBreakTest.txt:
	$(CURL) $(CURLFLAGS) -o $@.download $(URLCACHE)http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt || { rm -f $@.download; exit 1; }
	$(PERL) -pe 's,÷,/,g;s,×,+,g' $@.download > $@; status=$$?; rm -f $@.download; exit $$status
|
||
# Remove everything this Makefile downloads or generates, including
# GraphemeBreakProperty.txt and the generated utf8proc_data.c.new.
clean:
	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
	rm -f utf8proc_data.c.new
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
# Following work by @jiahao, we compute character widths using a combination of | ||
# * advance widths from GNU Unifont (advance width 512 = 1 en) | ||
# * UAX 11: East Asian Width | ||
# * a few exceptions as needed | ||
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 | ||
# | ||
# Requires Julia (obviously) and FontForge. | ||
|
||
#############################################################################
# Julia 0.3/0.4 compatibility (taken from Compat package)
if VERSION < v"0.4.0-dev+1419"
    const UInt16 = Uint16
end

# Table of codepoint => width, filled in by the sections below.
CharWidths = Dict{Int,Int}()

#############################################################################
# Widths from GNU Unifont

# The Unifont version comes from the environment (the Makefile exports it),
# falling back to 7.0.06.
universion=get(ENV, "UNIFONT_VERSION", "7.0.06")
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
    # Fetch the TrueType font unless it is already present.
    if !isfile("$fontfile.ttf")
        download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
    end
    # Convert it to FontForge's text-based .sfd format unless already done.
    # -lang=ff is needed with recent FontForge versions, whose -c option
    # accepts Python scripts by default rather than FontForge's own language.
    if !isfile("$fontfile.sfd")
        run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
    end
end
|
||
# Read an sfd file for character widths.
#
# Scans `filename` line by line. After each "StartChar: " line it collects the
# third whitespace-separated field of the "Encoding:" line (the code point)
# and the second field of the "Width:" line, then stores
# CharWidths[codepoint] = div(width, 512)   (512 units to the en).
#
# Glyphs whose code point is negative are skipped: the sfd file has some
# codepoint -1 entries in the beginning.
#
# Returns the `CharWidths` dictionary that was passed in (or a fresh one),
# updated in place.
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    state = :seekchar
    # Declared before the loop so the values persist from one line to the
    # next instead of being scoped to a single loop iteration.
    codepoint = width = nothing
    io = open(filename)
    try
        for line in eachline(io)
            if state == :seekchar           #StartChar: nonmarkingreturn
                if contains(line, "StartChar: ")
                    codepoint = nothing
                    width = nothing
                    state = :readdata
                end
            elseif state == :readdata       #Encoding: 65538 -1 2, Width: 1024
                contains(line, "Encoding:") && (codepoint = int(split(line)[3]))
                contains(line, "Width:") && (width = int(split(line)[2]))
                if codepoint != nothing && width != nothing
                    if codepoint >= 0
                        CharWidths[codepoint] = div(width, 512) # 512 units to the en
                    end
                    # Finish this glyph even when it is unencoded; otherwise
                    # its width would linger and be assigned to the next
                    # glyph as soon as that glyph's Encoding line is read.
                    state = :seekchar
                end
            end
        end
    finally
        close(io)   # do not leak the file handle
    end
    CharWidths
end
CharWidths=parsesfd("unifont-$universion.sfd", CharWidths) | ||
CharWidths=parsesfd("unifont_upper-$universion.sfd", CharWidths) | ||
|
||
#############################################################################
# Widths from UAX #11: East Asian Width
#   .. these take precedence over the Unifont width for all codepoints
#      listed explicitly as wide/full/narrow/half-width

isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")
eawfile = open("EastAsianWidth.txt")
try
    for line in eachline(eawfile)
        #Strip comments (and tolerate completely empty lines)
        (isempty(line) || line[1] == '#') && continue
        precomment = split(line, '#')[1]
        #Parse code point range and width code
        tokens = split(precomment, ';')
        length(tokens) >= 2 || continue
        charrange = strip(tokens[1])   # tolerate padding around the range
        width = strip(tokens[2])
        #Decide the width once per line; other width codes leave the
        #Unifont-derived width untouched
        if width=="W" || width=="F"        # wide or full
            w = 2
        elseif width=="Na" || width=="H"   # narrow or half
            w = 1
        else
            continue
        end
        #Parse code point range into Julia UnitRange
        rangetokens = split(charrange, "..")
        charstart = uint32("0x"*rangetokens[1])
        charend = uint32("0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

        #Assign widths
        for c in charstart:charend
            CharWidths[c] = w
        end
    end
finally
    close(eawfile)   # do not leak the file handle
end
|
||
#############################################################################
# A few exceptions to the above cases, found by manual comparison
# to other wcwidth functions and similar checks.

# Category code of codepoint `c`, looked up through ../libutf8proc rather than
# the copy inside Julia, to minimize bootstrapping complexity when a new
# version of Unicode comes out. The code is read as the leading UInt16 of the
# record returned by utf8proc_get_property.
function catcode(c)
    # out-of-range codepoints: see utf8proc_get_property docs
    if uint(c) > 0x10FFFF
        return 0x0000
    end
    prop = ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c)
    return unsafe_load(prop)
end
|
||
# use Base.UTF8proc module to get category codes constants, since | ||
# we aren't going to change these in utf8proc. | ||
import Base.UTF8proc | ||
|
||
# Force a width of zero for every codepoint whose general category says it
# should not occupy a column.
for c in keys(CharWidths)
    cat = catcode(c)

    # format control characters (category Cf) have width 0, except for
    # the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
    formatctl = cat==UTF8proc.UTF8PROC_CATEGORY_CF && !(c in [0x0601,0x0602,0x0603,0x06dd])

    # Unifont has nonzero width for a number of non-spacing combining
    # characters, e.g. (in 7.0.06): f84,17b4,17b5,180b,180d,2d7f, and
    # the variation selectors
    nonspacing = cat==UTF8proc.UTF8PROC_CATEGORY_MN

    # unassigned and private-use codepoints (Unifont includes ConScript
    # Unicode Registry PUA fonts, but since these are nonstandard it seems
    # questionable to recognize them)
    unrecognized = cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN

    # for some reason, Unifont has width-2 glyphs for ASCII control chars
    control = cat==UTF8proc.UTF8PROC_CATEGORY_CC

    if formatctl || nonspacing || unrecognized || control
        CharWidths[c]=0
    end
end
|
||
# Hand-assigned widths, applied last so that they win over everything above:
#   0x2028 LINE SEPARATOR (Zl), 0x2029 PARAGRAPH SEPARATOR (Zp):
#       by definition, zero width (on the same line)
#   0x202f NARROW NO-BREAK SPACE (Zs):
#       by definition, narrow = width of 1 en space
#   0x2001 EM QUAD (Zs), 0x2003 EM SPACE (Zs):
#       by definition, wide = width of 1 em space
for (c, w) in [(0x2028, 0), (0x2029, 0), (0x202f, 1), (0x2001, 2), (0x2003, 2)]
    CharWidths[c] = w
end
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jiahao, in a monospaced font, like in a terminal, aren't em and en the same? |
||
#############################################################################
# Output (to a file or pipe) for processing by data_generator.rb
#
# Consecutive codepoints sharing a width are collapsed into one line,
# "FIRST..LAST; width" (or "FIRST; width" for a single codepoint). Codepoints
# missing from CharWidths count as width 0, and zero-width runs are printed
# like any other run.

firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
for c in 0x0000:0x110000
    w = get(CharWidths, c, 0)
    # still inside the current run (0x110000 is a sentinel that flushes the
    # final run)
    (w == lastv && c != 0x110000) && continue
    w < 4 || error("invalid charwidth $w for $c")
    span = firstc+1 < c ? string(uhex(firstc), "..", uhex(c-1)) : uhex(firstc)
    println(span, "; ", lastv)
    firstc = c
    lastv = w
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that -lang=ff seems to be needed for recent versions of fontforge, since they changed the -c option to accept Python scripts by default rather than FontForge's own scripting language.