# fontimize.py

#!/bin/env python3

# Fontimize
# 
# A library to optimise font files for web use, by only including the characters used in the text.
# Author: David Millington, github.com/vintagedave
# License: GPLv3
#
# Wraps TTF2Web, which converts font files from TTF to WOFF2 allowing Unicode ranges to be specified.
# Allows fonts to be converted and subsetted for text, including passing a set of HTML files and
# automatically using the characters in the HTML, plus user-visible characters in CSS files used 
# by those HTML files (such as :before and :after pseudo-elements), and then converting / subsetting
# the fonts specified by those CSS files.
# 
# Originally written as part of a private static site generator. Fontimizer is run as the final step
# of the build process, to optimise the fonts used by the site. It's now been extracted into a  
# separate library, and is available on GitHub at github.com/vintagedave/fontimize

import os
import sys
from bs4 import BeautifulSoup
from ttf2web import TTF2Web
from os import path
import tinycss2
import typing
import pathlib
from pathvalidate import ValidationError, validate_filename
    
def _get_unicode_string(char : chr, withU : bool = True) -> str:
    return ('U+' if withU else '') + hex(ord(char))[2:].upper().zfill(4) # eg U+1234

def get_used_characters_in_str(s : str) -> set[chr]:
    """Return the set of characters used in `s`, plus a few typographic extras.

    A space is always included, otherwise no font file is generated by TTF2Web.
    If the text uses a straight double quote, straight single quote or hyphen,
    the matching curly quotes or en/em dashes are added as well.
    """
    # plain character -> extra variants to include whenever the plain one is used
    extra_variants = (
        ('"', ('“', '”')),
        ("'", ('‘', '’')),
        ('-', ('–', '—')),  # en-dash, em-dash
    )

    used : set[chr] = set(s)
    used.add(" ")
    for plain, variants in extra_variants:
        if plain in used:
            used.update(variants)
    return used

def get_used_characters_in_html(html : str) -> set[chr]:
    """Return the characters used in the text of an HTML string.

    The HTML is parsed with BeautifulSoup's 'html.parser', its text is extracted
    with get_text(), and that text is handed to get_used_characters_in_str.
    """
    extracted_text = BeautifulSoup(html, 'html.parser').get_text()
    return get_used_characters_in_str(extracted_text)

class charPair:
    """An inclusive range of characters, running from `first` to `second`."""

    def __init__(self, first : chr, second : chr):
        self.first, self.second = first, second

    def __str__(self):
        # Pairs are inclusive, shown as eg "[a-q]"
        return "".join(("[", self.first, "-", self.second, "]"))

    def __repr__(self):
        # Same text as __str__, so print()-ing a list of pairs is readable
        return str(self)

    def __eq__(self, other):
        # Anything that is not a charPair is never equal
        if not isinstance(other, charPair):
            return False
        return self.first == other.first and self.second == other.second

    def get_range(self):
        """Return the pair as a unicode range: "U+0061" for a single character,
        or eg "U+0061-0071" when first and second differ."""
        start = _get_unicode_string(self.first)
        if self.first == self.second:
            return start
        return start + '-' + _get_unicode_string(self.second, False)


# Taking a sorted list of characters, find the sequential subsets and return pairs of the start and end
# of each sequential subset
def _get_char_ranges(chars : list[chr]):
    chars.sort()
    if not chars:
        return []
    res : list[charPair] = []
    first : chr = chars[0]
    prev_seen : chr = first
    for c in chars[1:]:
        expected_next_char = chr(ord(prev_seen) + 1)
        if c != expected_next_char:
            # non-sequential, so time to start a new set
            pair = charPair(first, prev_seen)
            res.append(pair)
            first = c
        prev_seen = c
    # add final set if it hasn't been added yet
    if (not res) or (res[-1].second != prev_seen):
        pair = charPair(first, prev_seen)
        res.append(pair)

    return res

# Get the total size of multiple files (used for calculating font file sizes)
def _get_file_size_sum(files: list[str]) -> str:
    sum = 0
    for f in files:
        sum = sum + path.getsize(f)
    return sum
    
# Convert to human-readable size in MB or KB
def _file_size_to_readable(size : int) -> str:
    return str(round(size / 1024)) + "KB" if size < 1024 * 1024 else str(round(size / (1024 * 1024), 1)) + "MB" # nKB or n.nMB

def optimise_fonts(text : str, fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Generate a new font file for each input font, containing only the characters used in `text`.

    The other optimise_fonts_* functions (HTML contents, multiple pieces of text,
    files on disk) collect their text and all end up here.

    Args:
        text: The text whose characters the generated fonts must contain.
        fonts: Paths of the font files to convert and subset.
        fontpath: Directory for the generated fonts. If empty, each generated font
            is placed in the same folder as its original.
        subsetname: Phrase that appears in the generated filename,
            eg 'Arial.FontimizeSubset.woff2'.
        verbose: Print the characters, ranges and generated files, and run TTF2Web
            with verbosity 2 (otherwise 0).
        print_stats: Print the number of fonts processed and the size savings.

    Returns:
        A dict with the keys:
            "css": always an empty dict at this level (no CSS files are involved);
                included so API consumers can rely on the key existing.
            "chars": the set of characters used in the input text.
            "uranges": a comma-separated string of the unicode ranges covering those characters.
            "fonts": dict of original font path -> generated font path.
    """
    verbosity = 2 if verbose else 0 # ttf2web has 0, 1, 2, so match that to off and on

    res : dict[str, typing.Any] = {}
    res["css"] = {} # at this level there are no CSS files, include just to prevent errors for API consumer

    characters = get_used_characters_in_str(text)

    char_list = list(characters)
    if verbosity >= 2:
        print("Characters:")
        print("  " + str(char_list))
    res["chars"] = characters # set of characters used in the input text

    char_ranges = _get_char_ranges(char_list)
    if verbosity >= 2:
        print("Character ranges:")
        print("  " + str(char_ranges))

    uranges_str = ', '.join(r.get_range() for r in char_ranges)
    uranges = [[subsetname, uranges_str]] # subsetname here will be in the generated font, eg 'Arial.FontimizeSubset.woff2'
    if verbosity >= 2:
        print("Unicode ranges:")
        print("  " + uranges_str)
    res["uranges"] = uranges_str # string of unicode ranges matching the characters used in the input text

    # For each font, generate a new font file using only the used characters
    # By default, place it in the same folder as the respective font, unless fontpath is specified
    res["fonts"] = {} # dict of old font path -> new font path
    for font in fonts:
        assetdir = fontpath if fontpath else path.dirname(font)
        t2w = TTF2Web(font, uranges, assetdir=assetdir)
        woff2_list = t2w.generateWoff2(verbosity=verbosity)
        assert len(woff2_list) == 1 # We only expect one font file to be generated, per font input
        assert len(woff2_list[0]) == 2 # Pair of font, plus ranges -- we only care about [0], the font
        res["fonts"][font] = woff2_list[0][0]

    if verbosity >= 2:
        print("Generated the following fonts from the originals:")
        for original, generated in res["fonts"].items():
            print("  " + original + " ->\n    " + generated)

    if (verbosity >= 2) or print_stats:
        print("Results:")
        print("  Fonts processed: " + str(len(res["fonts"])))
        if (verbosity == 1): # If 2, printed above already. NOTE: verbosity is only ever 0 or 2 at present
            print("  Generated (use verbose output for input -> generated map):")
            for generated in res["fonts"].values():
                print("    " + generated)
        sum_orig = _get_file_size_sum(list(res["fonts"].keys()))
        sum_new = _get_file_size_sum(list(res["fonts"].values()))
        print("  Total original font size: " + _file_size_to_readable(sum_orig))
        print("  Total optimised font size: " + _file_size_to_readable(sum_new))
        savings = sum_orig - sum_new
        # With no fonts (or only zero-byte inputs) the original total is 0; report 0%
        # rather than raising ZeroDivisionError
        savings_percent = (savings / sum_orig * 100) if sum_orig else 0.0
        print("  Savings: " +  _file_size_to_readable(savings) + " less, which is " + str(round(savings_percent, 1)) + "%!")
        print("Thankyou for using Fontimize!") # A play on Font and Optimise, haha, so good pun clever. But seriously - hopefully a memorable name!

    return res

def optimise_fonts_for_multiple_text(texts : list[str], fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Take a list of strings, and otherwise do the same as optimise_fonts.

    The strings are concatenated and passed to optimise_fonts together with all
    the other arguments; the return value is whatever optimise_fonts returns.
    """
    text = "".join(texts)
    # subsetname must be forwarded too, otherwise a caller's custom name is silently
    # replaced by optimise_fonts' default
    return optimise_fonts(text, fonts, fontpath, subsetname=subsetname, verbose=verbose, print_stats=print_stats)

def optimise_fonts_for_html_contents(html_contents : list[str], fonts : list[str], fontpath : str = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True) -> dict[str, typing.Any]:
    """Take a list of HTML strings, and otherwise do the same as optimise_fonts.

    Each HTML string is parsed with BeautifulSoup to get the used text (ie ignoring
    HTML tags). The combined text is passed to optimise_fonts together with all the
    other arguments; the return value is whatever optimise_fonts returns.
    """
    text = "".join(BeautifulSoup(html, 'html.parser').get_text() for html in html_contents)
    # subsetname must be forwarded too, otherwise a caller's custom name is silently
    # replaced by optimise_fonts' default
    return optimise_fonts(text, fonts, fontpath, subsetname=subsetname, verbose=verbose, print_stats=print_stats)

def _find_font_face_urls(css_contents : str) -> list[str]:
    """Return the URLs found in the `src` declarations of all @font-face rules in a stylesheet.

    Both the quoted form, url("font.ttf"), and the unquoted form, url(font.ttf), are
    recognised. Other functions in the value, such as format(...), are ignored.

    Args:
        css_contents: The text of a CSS stylesheet.

    Returns:
        The URLs as strings, in the order they appear. They are returned exactly as
        written in the CSS, so are typically relative to the CSS file.
    """
    urls = []

    for rule in tinycss2.parse_stylesheet(css_contents):
        if rule.type != 'at-rule' or rule.lower_at_keyword != 'font-face':
            continue
        if rule.content is None:
            # An at-rule ending in ';' with no {} block has no declarations to parse
            continue

        # Parse the @font-face rule, find all src declarations, parse them
        for declaration in tinycss2.parse_declaration_list(rule.content):
            if declaration.type != 'declaration' or declaration.lower_name != 'src':
                continue

            # Manually walk the declaration value to extract the URLs. This is instead of:
            #   src_tokens = tinycss2.parse_component_value_list(declaration.value)
            # which generates an error swapping U+0000 with another value. Unknown why.
            for token in declaration.value:
                if token.type == 'url':
                    # Unquoted url(font.ttf) is tokenised as a single URL token
                    urls.append(token.value)
                elif token.type == 'function' and token.lower_name == 'url':
                    # Quoted url("font.ttf") is a function whose argument is a string token.
                    # Take the first string rather than arguments[0], which may be
                    # whitespace, or missing entirely for an empty url()
                    for argument in token.arguments:
                        if argument.type == 'string':
                            urls.append(argument.value)
                            break

    return urls

def _get_path(known_file_path : str, relative_path : str) -> str:
    base_dir = path.dirname(known_file_path)

    # Join the base directory with the relative path
    full_path = path.join(base_dir, relative_path)

    return full_path

def _extract_pseudo_elements_content(css_contents: str) -> list[str]:
    """Collect the `content` strings of rules whose selector mentions :before or :after.

    For every qualified rule whose serialized selector contains ':before' or ':after'
    (something like cite:before, for example), each `content` declaration contributes
    one entry: the concatenation of all string tokens in its value.

    Args:
        css_contents: The text of a CSS stylesheet.

    Returns:
        One string per matching `content` declaration, in stylesheet order.
    """
    found = []

    for rule in tinycss2.parse_stylesheet(css_contents, skip_whitespace=True):
        if rule.type != 'qualified-rule':
            continue

        selector = tinycss2.serialize(rule.prelude)
        if ':before' not in selector and ':after' not in selector:
            continue

        for decl in tinycss2.parse_declaration_list(rule.content):
            if decl.type == 'declaration' and decl.lower_name == 'content':
                string_parts = [tok.value for tok in decl.value if tok.type == 'string']
                found.append(''.join(string_parts))

    return found

def optimise_fonts_for_files(files : list[str], font_output_dir = "", subsetname = "FontimizeSubset", verbose : bool = False, print_stats : bool = True, fonts : list[str] = [], addtl_text : str = "") -> dict[str, typing.Any]:
    """Optimise fonts for a list of files on disk.

    HTML files (.html / .htm) are parsed; all others are treated as text. First, all
    text is collected from those files. The HTML files are also inspected for the CSS
    files they link to. From those CSS files, the fonts used in @font-face src are
    collected, plus any additional characters that will be reflected in rendered
    webpage output, such as :before and :after pseudo-elements. Everything is then
    passed to optimise_fonts.

    Args:
        files: Paths of the input files. May be empty if `addtl_text` is given.
        font_output_dir: Directory for the generated fonts, passed to optimise_fonts as
            `fontpath` (empty means alongside each original font).
        subsetname: Phrase used in the generated font filenames.
        verbose: Print the CSS files and fonts found, and enable verbose output in optimise_fonts.
        print_stats: Passed through to optimise_fonts.
        fonts: Additional, user-specified input font files. (The default list is only
            read, never modified.)
        addtl_text: Extra text to include in addition to the text found in `files`.

    Returns:
        The dict returned by optimise_fonts, with "css" replaced by the set of CSS file
        paths found. If there are no inputs, no text or no fonts, an error is printed
        and a dict with empty "fonts", "chars" and "uranges" values is returned instead.
    """
    if (len(files) == 0) and len(addtl_text) == 0: # If you specify any text, input files are optional -- note, not documented, used for cmd line app
        print("Error: No input files. Exiting.")
        res = {
            "css" : [],
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }
        # Return here; falling through would go on to report a second, misleading error
        return res

    text = addtl_text
    css_files : set[str] = set()
    font_files : set[str] = set(fonts) # start with the user-specified input font files

    for f in files:
        file_ext = pathlib.Path(f).suffix.lower()
        with open(f, 'r') as file:
            if file_ext == '.html' or file_ext == '.htm':
                html = file.read()
                soup = BeautifulSoup(html, 'html.parser')

                # Extract used text
                text += soup.get_text()

                # Extract CSS files the HTML references
                for link in soup.find_all('link', href=True):
                    if 'css' in link['href']:
                        css_ref = link['href']
                        adjusted_css_path = _get_path(f, css_ref) # It'll be relative, so relative to the HTML file
                        css_files.add(adjusted_css_path)
            else: # not HTML, treat as text
                text += file.read()

    # Sanity check that there is any text to process
    if len(text) == 0:
        print("Error: No text found in the input files or additional text. Exiting.")
        res = {
            "css" : [],
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }
        return res

    # Extract fonts from CSS files
    for css_file in css_files:
        with open(css_file, 'r') as file:
            css = file.read()

        # Extract the contents of all :before and :after CSS pseudo-elements; add these to the text
        pseudo_elements = _extract_pseudo_elements_content(css)
        for pe in pseudo_elements:
            text += pe

        # List of all fonts from @font-face src url: statements. This assumes they're all local files
        font_urls = _find_font_face_urls(css)
        for font_url in font_urls:
            # Only handle local files -- this does not support remote files.
            # Resolve relative to the CSS file being processed right now (css_file), not
            # to whichever CSS path the HTML loop above happened to see last
            adjusted_font_path = _get_path(css_file, font_url)
            if path.isfile(adjusted_font_path):
                font_files.add(adjusted_font_path)
            else:
                print("Warning: Font file not found (may be remote not local?); skipping: " + font_url + " (resolved to " + adjusted_font_path + ")")

    if verbose:
        print("Found the following CSS files:")
        for css_file in css_files:
            print("  " + css_file)

        print("Found the following fonts:")
        for font_file in font_files:
            print("  " + font_file)

    if len(font_files) == 0:
        print("Error: No fonts found in the input files. Exiting.")
        res = {
            "css" : css_files,
            "fonts" : [],
            "chars": set(),
            "uranges": []
        }
        return res

    res = optimise_fonts(text, font_files, fontpath=font_output_dir, subsetname=subsetname, verbose=verbose, print_stats=print_stats)
    res["css"] = css_files
    return res


# Note that unit tests for this file are in tests.py; run that file to run the tests
if __name__ == '__main__':
    # Command-line entry point: parse and validate the arguments, then hand everything
    # to optimise_fonts_for_files. Any validation failure prints an error and exits with 1.
    import argparse
    parser = argparse.ArgumentParser(description="Optimize fonts to only the specific glyphs needed for your text or HTML files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    fontimize.py 1.html 2.txt
    fontimize.py --outputdir output --subsetname MySubset --verbose 1.html 2.txt
    fontimize.py --text "The fonts will contain only the glyphs in this string" --fonts "Arial.ttf" "Times New Roman.ttf"
                """)

    parser.add_argument('inputfiles', default=[], nargs='*', help='Input files to parse: .htm and .html are parsed as HTML to extract used text, all other files are treated as text')
    parser.add_argument('-t', '--text', type=str, help='Input text to parse, specified directly on the command line')
    parser.add_argument('-f', '--fonts', default=[], nargs='*', help='Input font files')

    group_output = parser.add_argument_group('Output', 'Specify font output directory and font subset phrase in the generated filenames')
    group_output.add_argument("-o", "--outputdir", type=str,
                        help="Directory in which to place the generated font files (default is the same directory as the original font files)",
                        default="")
    group_output.add_argument("-s", "--subsetname", type=str,
                        help="Phrase used in the output font filenames, eg 'Arial.SubsetName.woff2'",
                        default="FontimizeSubset")

    group_verb = parser.add_argument_group('Verbosity', 'Control how much Fontimize prints to the console')
    group_verb.add_argument("-v", "--verbose", help="Output significant / diagnostic info about discovered files and fonts, and generated fonts and their glyphs",
                    action="store_true")
    group_verb.add_argument("-n", "--nostats", help="Do not output info about the sizes of the original and generated fonts and the amount of space saved (shown by default)",
                    action="store_true")

    args = parser.parse_args()

    # If both --text and inputfiles are specified, give an error
    if args.text and args.inputfiles:
        print("Error: Both --text and input files cannot be specified at the same time.")
        sys.exit(1)

    # If neither --text nor inputfiles are specified, give an error
    if not args.text and not args.inputfiles:
        print("Error: Either --text or input files must be specified.")
        sys.exit(1)

    _addtl_text = args.text if args.text else ""

    # If inputfiles are specified, test they exist
    _inputfiles = []
    if args.inputfiles:
        for file in args.inputfiles:
            if not os.path.exists(file):
                print(f"Error: Input file '{file}' does not exist.")
                sys.exit(1)
        _inputfiles = args.inputfiles

    # If fonts are specified, test they exist
    _fonts = []
    if args.fonts:
        for file in args.fonts:
            if not os.path.exists(file):
                print(f"Error: Font file '{file}' does not exist.")
                sys.exit(1)
        _fonts = args.fonts

    # If outputdir is specified, test it exists and really is a directory
    # (a path to an existing regular file is not a usable output directory)
    _outputdir = ""
    if args.outputdir:
        if not os.path.isdir(args.outputdir):
            print(f"Error: Output directory '{args.outputdir}' does not exist.")
            sys.exit(1)
        _outputdir = args.outputdir

    # If subsetname is specified, test it's valid
    _subsetname = ""
    if args.subsetname:
        try:
            validate_filename(args.subsetname)
        except ValidationError as e:
            print(f"Error: Subset name '{args.subsetname}' is not valid: {e}")
            sys.exit(1)
        _subsetname = args.subsetname

    # Both flags are store_true, so they are always plain booleans
    _verbose = args.verbose
    _printstats = not args.nostats # stats are shown unless --nostats is given

    res = optimise_fonts_for_files(_inputfiles,
                                   font_output_dir=_outputdir,
                                   subsetname=_subsetname,
                                   verbose=_verbose,
                                   print_stats=_printstats,
                                   fonts=_fonts,
                                   addtl_text=_addtl_text)

    if args.verbose:
        print("Done.")