-
Notifications
You must be signed in to change notification settings - Fork 2
/
fontimize.py
438 lines (368 loc) · 18.5 KB
/
fontimize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
#!/bin/env python3
# Fontimize
#
# A library to optimise font files for web use, by only including the characters used in the text.
# Author: David Millington, github.com/vintagedave
# License: GPLv3
#
# Wraps TTF2Web, which converts font files from TTF to WOFF2 allowing Unicode ranges to be specified.
# Allows fonts to be converted and subsetted for text, including passing a set of HTML files and
# automatically using the characters in the HTML, plus user-visible characters in CSS files used
# by those HTML files (such as :before and :after pseudo-elements), and then converting / subsetting
# the fonts specified by those CSS files.
#
# Originally written as part of a private static site generator. Fontimizer is run as the final step
# of the build process, to optimise the fonts used by the site. It's now been extracted into a
# separate library, and is available on GitHub at github.com/vintagedave/fontimize
import os
import sys
from bs4 import BeautifulSoup
from ttf2web import TTF2Web
from os import path
import tinycss2
import typing
import pathlib
from pathvalidate import ValidationError, validate_filename
def _get_unicode_string(char : chr, withU : bool = True) -> str:
return ('U+' if withU else '') + hex(ord(char))[2:].upper().zfill(4) # eg U+1234
def get_used_characters_in_str(s : str) -> set[chr]:
    """Return the set of characters occurring in s, plus a few extras.

    A space is always included. When a straight double quote, straight single
    quote or hyphen is present, its curly-quote / dash variants are added too.
    """
    # Always contain space, otherwise no font file generated by TTF2Web
    used : set[chr] = set(s) | {" "}

    # Special characters and the extra variants each one pulls in
    variants = (
        ('"', '“”'),
        ("'", '‘’'),
        ('-', '–—'),  # en-dash, em-dash
    )
    for trigger, extras in variants:
        if trigger in used:
            used.update(extras)

    return used
def get_used_characters_in_html(html : str) -> set[chr]:
    """Return the characters used by the text of an HTML string.

    The markup is parsed with BeautifulSoup's 'html.parser'; the text it
    extracts is then passed to get_used_characters_in_str.
    """
    extracted_text = BeautifulSoup(html, 'html.parser').get_text()
    return get_used_characters_in_str(extracted_text)
class charPair:
    """An inclusive run of characters, from first up to and including second."""

    def __init__(self, first : chr, second : chr):
        self.first, self.second = first, second

    def __str__(self):
        # Pairs are inclusive
        return "".join(("[", self.first, "-", self.second, "]"))

    def __repr__(self):
        # Same text as __str__, so pairs inside a list print() readably
        return self.__str__()

    def __eq__(self, other):
        # Anything that is not a charPair never compares equal
        if not isinstance(other, charPair):
            return False
        return self.first == other.first and self.second == other.second

    def get_range(self):
        """Return the pair as a unicode-range string.

        A single character gives eg "U+0061"; a longer run gives
        eg "U+0061-0071" (the end has no "U+" prefix).
        """
        start = _get_unicode_string(self.first)
        if self.first == self.second:
            return start
        return start + '-' + _get_unicode_string(self.second, False)
# Taking a list of characters, find the sequential subsets and return pairs of the start and end
# of each sequential subset
def _get_char_ranges(chars : list[chr]):
    """Group characters into inclusive runs of consecutive code points.

    Args:
        chars: Single-character strings, in any order; duplicates are ignored.
            The list itself is not modified.

    Returns:
        A list of charPair objects in ascending order, one per run. An empty
        input gives an empty list.
    """
    # Work on a sorted, de-duplicated copy: the caller's list is left alone, and
    # a repeated character cannot split one run into overlapping pairs.
    ordered = sorted(set(chars))
    if not ordered:
        return []

    res : list[charPair] = []
    first = prev_seen = ordered[0]
    for c in ordered[1:]:
        # Compare code points directly; building chr(ord(prev_seen) + 1) would
        # raise for the highest valid code point.
        if ord(c) != ord(prev_seen) + 1:
            # non-sequential, so close the current run and start a new one
            res.append(charPair(first, prev_seen))
            first = c
        prev_seen = c

    # The run still open when the loop ends has not been added yet
    res.append(charPair(first, prev_seen))
    return res
# Get the total size of multiple files (used for calculating font file sizes)
def _get_file_size_sum(files: list[str]) -> str:
sum = 0
for f in files:
sum = sum + path.getsize(f)
return sum
# Convert to human-readable size in MB or KB
def _file_size_to_readable(size : int) -> str:
return str(round(size / 1024)) + "KB" if size < 1024 * 1024 else str(round(size / (1024 * 1024), 1)) + "MB" # nKB or n.nMB
# Takes the input text, and the fonts, and generates new font files
# Other methods (eg taking HTML files, or multiple pieces of text) all end up here
def optimise_fonts(text : str, fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Generate, for each input font, a new font restricted to the characters used in text.

    Args:
        text: Text whose characters (plus the extras added by
            get_used_characters_in_str) the generated fonts should cover.
        fonts: Paths of the input font files.
        fontpath: Directory for the generated fonts; when empty, each one is
            placed in the same folder as its original.
        subsetname: Name paired with the unicode ranges passed to TTF2Web; it
            ends up in the generated filename, eg 'Arial.FontimizeSubset.woff2'.
        verbose: Print the characters, ranges and the input -> generated map.
        print_stats: Print the number of fonts and the size savings.

    Returns:
        A dict with the keys "css" (an empty dict at this level), "chars" (set
        of characters used), "uranges" (comma-separated unicode-range string)
        and "fonts" (dict of original font path -> generated font path).
    """
    verbosity = 2 if verbose else 0 # ttf2web has 0, 1, 2, so match that to off and on

    res : dict[str, typing.Any] = {}
    res["css"] = {} # at this level there are no CSS files, include just to prevent errors for API consumer

    characters = get_used_characters_in_str(text)
    char_list = list(characters)
    if verbosity >= 2:
        print("Characters:")
        print(" " + str(char_list))
    res["chars"] = characters # set of characters used in the input text

    char_ranges = _get_char_ranges(char_list)
    if verbosity >= 2:
        print("Character ranges:")
        print(" " + str(char_ranges))

    uranges_str = ', '.join(r.get_range() for r in char_ranges)
    uranges = [[subsetname, uranges_str]] # subsetname here will be in the generated font, eg 'Arial.FontimizeSubset.woff2'
    if verbosity >= 2:
        print("Unicode ranges:")
        print(" " + uranges_str)
    res["uranges"] = uranges_str # unicode ranges matching the characters used in the input text

    # For each font, generate a new font file using only the used characters
    # By default, place it in the same folder as the respective font, unless fontpath is specified
    res["fonts"] = {} # dict of old font path -> new font path
    for font in fonts:
        assetdir = fontpath if fontpath else path.dirname(font)
        t2w = TTF2Web(font, uranges, assetdir=assetdir)
        woff2_list = t2w.generateWoff2(verbosity=verbosity)
        assert len(woff2_list) == 1 # We only expect one font file to be generated, per font input
        assert len(woff2_list[0]) == 2 # Pair of font, plus ranges -- we only care about [0], the font
        res["fonts"][font] = woff2_list[0][0]

    if verbosity >= 2:
        print("Generated the following fonts from the originals:")
        for original, generated in res["fonts"].items():
            print(" " + original + " ->\n " + generated)

    if (verbosity >= 2) or print_stats:
        print("Results:")
        print(" Fonts processed: " + str(len(res["fonts"])))
        # NOTE: verbosity is only ever 0 or 2 in this function, so this branch
        # cannot currently run; kept in case an intermediate level is added.
        if verbosity == 1: # If 2, printed above already
            print(" Generated (use verbose output for input -> generated map):")
            for generated in res["fonts"].values():
                print(" " + generated)

        sum_orig = _get_file_size_sum(list(res["fonts"].keys()))
        sum_new = _get_file_size_sum(list(res["fonts"].values()))
        print(" Total original font size: " + _file_size_to_readable(sum_orig))
        print(" Total optimised font size: " + _file_size_to_readable(sum_new))
        savings = sum_orig - sum_new
        # With no fonts (or only zero-byte inputs) the original total is 0;
        # report 0% rather than dividing by zero.
        savings_percent = savings / sum_orig * 100 if sum_orig else 0.0
        print(" Savings: " + _file_size_to_readable(savings) + " less, which is " + str(round(savings_percent, 1)) + "%!")
        print("Thankyou for using Fontimize!") # A play on Font and Optimise, haha, so good pun clever. But seriously - hopefully a memorable name!

    return res
# Takes a list of strings, and otherwise does the same as optimise_fonts
def optimise_fonts_for_multiple_text(texts : list[str], fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Concatenate several pieces of text and optimise the fonts for the combined text.

    All other arguments, and the return value, are those of optimise_fonts.
    """
    text = "".join(texts)
    # subsetname must be forwarded explicitly, otherwise the caller's value is
    # silently replaced by optimise_fonts' own default.
    return optimise_fonts(text, fonts, fontpath, subsetname=subsetname, verbose=verbose, print_stats=print_stats)
# Takes a list of HTML strings, and parses those to get the used text (ie ignoring HTML tags);
# then uses that to do the same as optimise_fonts
def optimise_fonts_for_html_contents(html_contents : list[str], fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Extract the text of each HTML string and optimise the fonts for the combined text.

    Each string is parsed with BeautifulSoup's 'html.parser'. All other
    arguments, and the return value, are those of optimise_fonts.
    """
    text = "".join(BeautifulSoup(html, 'html.parser').get_text() for html in html_contents)
    # subsetname must be forwarded explicitly, otherwise the caller's value is
    # silently replaced by optimise_fonts' own default.
    return optimise_fonts(text, fonts, fontpath, subsetname=subsetname, verbose=verbose, print_stats=print_stats)
def _find_font_face_urls(css_contents : str) -> list[str]:
    """Return every URL named by a src declaration inside an @font-face rule.

    Both the quoted form, url("font.woff2"), and the unquoted form,
    url(font.woff2), are recognised. URLs are returned as written in the CSS,
    in document order.
    """
    urls = []
    for rule in tinycss2.parse_stylesheet(css_contents):
        if rule.type != 'at-rule' or rule.lower_at_keyword != 'font-face':
            continue
        if rule.content is None:
            # An at-rule terminated by ';' has no block to inspect
            continue

        # Parse the @font-face rule, find all src declarations, parse them
        for declaration in tinycss2.parse_declaration_list(rule.content):
            if declaration.type != 'declaration' or declaration.lower_name != 'src':
                continue
            # Walk the declaration's component values directly. This is instead of
            #   tinycss2.parse_component_value_list(declaration.value)
            # which generates an error swapping U+0000 with another value. Unknown why.
            for token in declaration.value:
                if token.type == 'url':
                    # Unquoted url(...) arrives as a single url token
                    urls.append(token.value)
                elif token.type == 'function' and token.lower_name == 'url':
                    # Quoted url("...") arrives as a function block; take its
                    # first string argument, skipping any surrounding whitespace
                    for argument in token.arguments:
                        if argument.type == 'string':
                            urls.append(argument.value)
                            break
    return urls
def _get_path(known_file_path : str, relative_path : str) -> str:
base_dir = path.dirname(known_file_path)
# Join the base directory with the relative path
full_path = path.join(base_dir, relative_path)
return full_path
def _extract_pseudo_elements_content(css_contents: str) -> list[str]:
    """Collect the string values of content declarations in :before / :after rules.

    For every qualified rule whose serialized selector contains ':before' or
    ':after', each content declaration contributes one entry: its string
    tokens joined together.
    """
    found = []
    for rule in tinycss2.parse_stylesheet(css_contents, skip_whitespace=True):
        if rule.type != 'qualified-rule':
            continue

        selector_text = tinycss2.serialize(rule.prelude)
        # this is something like cite:before, for example
        if ':before' not in selector_text and ':after' not in selector_text:
            continue

        for declaration in tinycss2.parse_declaration_list(rule.content):
            if declaration.type == 'declaration' and declaration.lower_name == 'content':
                string_parts = [token.value for token in declaration.value if token.type == 'string']
                found.append(''.join(string_parts))
    return found
# Takes a list of files on disk
# HTML files are parsed; all others are treated as text
# First, collect all strings from those files.
# Then, also parse to get all the CSS files they use. From those CSS files, collect all the fonts they use in @font-face src,
# plus look for any additional characters that will be reflected in rendered webpage output, such as :before and :after pseudo-elements.
def optimise_fonts_for_files(files : list[str], font_output_dir = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True, fonts : list[str] = [], addtl_text : str = "") -> dict[str, typing.Any]:
    """Optimise the fonts used by a set of HTML / text files.

    Args:
        files: Paths of the input files. '.html' / '.htm' files are parsed as
            HTML; every other file is read as plain text.
        font_output_dir: Directory for the generated fonts, passed to
            optimise_fonts as fontpath.
        subsetname: Passed through to optimise_fonts.
        verbose: Print the CSS files and fonts that were found.
        print_stats: Passed through to optimise_fonts.
        fonts: Extra font files to process in addition to those found in CSS.
            (The default list is never mutated here.)
        addtl_text: Extra text whose characters are also included.

    Returns:
        On success, the dict from optimise_fonts with "css" replaced by the set
        of CSS file paths found. On error (no input, no text, or no fonts) a
        dict with the same keys and empty "fonts", "chars" and "uranges".
    """
    if (len(files) == 0) and len(addtl_text) == 0: # If you specify any text, input files are optional -- note, not documented, used for cmd line app
        print("Error: No input files. Exiting.")
        # Return straight away so the "no text" error below is not printed too
        return {
            "css" : [],
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }

    text = addtl_text
    css_files : set[str] = set()
    font_files : set[str] = set(fonts) # user-specified input font files

    for f in files:
        file_ext = pathlib.Path(f).suffix.lower()
        with open(f, 'r') as file:
            contents = file.read()

        if file_ext in ('.html', '.htm'):
            soup = BeautifulSoup(contents, 'html.parser')
            # Extract used text
            text += soup.get_text()
            # Extract CSS files the HTML references
            for link in soup.find_all('link', href=True):
                if 'css' in link['href']:
                    # It'll be relative, so relative to the HTML file
                    css_files.add(_get_path(f, link['href']))
        else: # not HTML, treat as text
            text += contents

    # Sanity check that there is any text to process
    if len(text) == 0:
        print("Error: No text found in the input files or additional text. Exiting.")
        return {
            "css" : [],
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }

    # Extract fonts from CSS files
    for css_file in css_files:
        with open(css_file, 'r') as file:
            css = file.read()

        # Extract the contents of all :before and :after CSS pseudo-elements; add these to the text
        text += "".join(_extract_pseudo_elements_content(css))

        # List of all fonts from @font-face src url: statements. This assumes they're all local files
        for font_url in _find_font_face_urls(css):
            # Only handle local files -- this does not support remote files.
            # Resolve against the CSS file being processed right now, not
            # whichever CSS path the HTML loop happened to see last.
            adjusted_font_path = _get_path(css_file, font_url)
            if path.isfile(adjusted_font_path):
                font_files.add(adjusted_font_path)
            else:
                print("Warning: Font file not found (may be remote not local?); skipping: " + font_url + " (resolved to " + adjusted_font_path + ")")

    if verbose:
        print("Found the following CSS files:")
        for css_file in css_files:
            print(" " + css_file)
        print("Found the following fonts:")
        for font_file in font_files:
            print(" " + font_file)

    if len(font_files) == 0:
        print("Error: No fonts found in the input files. Exiting.")
        return {
            "css" : css_files,
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }

    res = optimise_fonts(text, font_files, fontpath=font_output_dir, subsetname=subsetname, verbose=verbose, print_stats=print_stats)
    res["css"] = css_files
    return res
# Note that unit tests for this file are in tests.py; run that file to run the tests
if __name__ == '__main__':
    # Command-line entry point: parse and validate the arguments, then hand
    # everything to optimise_fonts_for_files.
    import argparse

    parser = argparse.ArgumentParser(description="Optimize fonts to only the specific glyphs needed for your text or HTML files",
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="""
Examples:
fontimize.py 1.html 2.txt
fontimize.py --outputdir output --subsetname MySubset --verbose 1.html 2.txt
fontimize.py --text "The fonts will contain only the glyphs in this string" --fonts "Arial.ttf" "Times New Roman.ttf"
""")
    parser.add_argument('inputfiles', default=[], nargs='*', help='Input files to parse: .htm and .html are parsed as HTML to extract used text, all other files are treated as text')
    parser.add_argument('-t', '--text', type=str, help='Input text to parse, specified directly on the command line')
    parser.add_argument('-f', '--fonts', default=[], nargs='*', help='Input font files')

    group_output = parser.add_argument_group('Output', 'Specify font output directory and font subset phrase in the generated filenames')
    group_output.add_argument("-o", "--outputdir", type=str,
                              help="Directory in which to place the generated font files (default is the same directory as the original font files)",
                              default="")
    group_output.add_argument("-s", "--subsetname", type=str,
                              help="Phrase used in the output font filenames, eg 'Arial.SubsetName.woff2'",
                              default="FontimizeSubset")

    group_verb = parser.add_argument_group('Verbosity', 'Control how much Fontimize prints to the console')
    group_verb.add_argument("-v", "--verbose", help="Output significant / diagnostic info about discovered files and fonts, and generated fonts and their glyphs",
                            action="store_true")
    group_verb.add_argument("-n", "--nostats", help="Do not output info about the sizes of the original and generated fonts and the amount of space saved (shown by default)",
                            action="store_true")

    args = parser.parse_args()

    # If both --text and inputfiles are specified, give an error
    if args.text and args.inputfiles:
        print("Error: Both --text and input files cannot be specified at the same time.")
        sys.exit(1)

    # If neither --text nor inputfiles are specified, give an error
    if not args.text and not args.inputfiles:
        print("Error: Either --text or input files must be specified.")
        sys.exit(1)

    _addtl_text = args.text if args.text else ""

    # If inputfiles are specified, test they exist
    for file in args.inputfiles:
        if not os.path.exists(file):
            print(f"Error: Input file '{file}' does not exist.")
            sys.exit(1)
    _inputfiles = args.inputfiles

    # If fonts are specified, test they exist
    for file in args.fonts:
        if not os.path.exists(file):
            print(f"Error: Font file '{file}' does not exist.")
            sys.exit(1)
    _fonts = args.fonts

    # If outputdir is specified, test it is an existing directory (a regular
    # file with that name is not a usable output location)
    _outputdir = ""
    if args.outputdir:
        if not os.path.isdir(args.outputdir):
            print(f"Error: Output directory '{args.outputdir}' does not exist or is not a directory.")
            sys.exit(1)
        _outputdir = args.outputdir

    # If subsetname is specified, test it's valid
    _subsetname = ""
    if args.subsetname:
        try:
            validate_filename(args.subsetname)
        except ValidationError as e:
            print(f"Error: Subset name '{args.subsetname}' is not valid: {e}")
            sys.exit(1)
        _subsetname = args.subsetname

    # Both flags are store_true, so they are already plain booleans
    _verbose = args.verbose
    _printstats = not args.nostats

    res = optimise_fonts_for_files(_inputfiles,
                                   font_output_dir=_outputdir,
                                   subsetname=_subsetname,
                                   verbose=_verbose,
                                   print_stats=_printstats,
                                   fonts=_fonts,
                                   addtl_text=_addtl_text)

    if args.verbose:
        print("Done.")