Added ligatures for letters represented by Unicode character sequences

irori · Jun 23, 2024 · ccbfe5f · ccbfe5f
1 parent d080938
commit ccbfe5f
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 6 deletions.
diff --git a/converter/charset.py b/converter/charset.py
@@ -8,6 +8,7 @@ def __init__(self, plane):
             raise ValueError('Invalid JIS plane %d' % plane)
         self.plane = plane
         self.decoder = codecs.getdecoder('euc_jis_2004')
+        self.encoder = codecs.getencoder('euc_jis_2004')
 
     def unicode(self, cp):
         # Convert JIS to EUC-JIS-2004 and then Unicode
@@ -25,6 +26,32 @@ def unicode(self, cp):
         except UnicodeDecodeError:
             return None
 
+    def decompose(self, ustr):
+        """Return the space-separated glyph names for the characters of ustr.
+
+        Each character is encoded to EUC-JIS-2004 on its own. A two-byte
+        result is named 'jis1-RR-CC' and a three-byte result introduced by
+        0x8f is named 'jis2-RR-CC', where RR and CC are the last two bytes
+        minus 0xa0. Anything else (an unencodable character, or a one-byte
+        or otherwise prefixed result) is named 'uXXXX' after its code point.
+        """
+        names = []
+        for u in ustr:
+            try:
+                euc = self.encoder(u)[0]
+            except UnicodeEncodeError:
+                euc = b''
+            # Choose the name from the encoded bytes rather than self.plane:
+            # the encoder is not restricted to this converter's plane, so
+            # indexing by self.plane can raise IndexError or go negative.
+            if len(euc) == 3 and euc[0] == 0x8f:
+                names.append(f'jis2-{euc[1] - 0xa0:02}-{euc[2] - 0xa0:02}')
+            elif len(euc) == 2 and euc[0] >= 0xa1:
+                names.append(f'jis1-{euc[0] - 0xa0:02}-{euc[1] - 0xa0:02}')
+            else:
+                names.append(f'u{ord(u):04X}')
+        return ' '.join(names)
+
 
 def codeconv(charset_registry, charset_encoding):
     if re.match(r'JISX\d+(\.\d+)?', charset_registry, flags=re.IGNORECASE):

diff --git a/converter/charset_test.py b/converter/charset_test.py
@@ -60,3 +60,14 @@ def test_plane2(self):
 
         self.assertEqual(unmapped, 9)
         self.assertEqual(len(unicode_to_jis), 2436)
+
+    def test_decompose(self):
+        # Each input sequence is expected to decompose into one glyph name
+        # per character, joined by single spaces.
+        cconv = charset.JIS(1)
+        expectations = {
+            '\u304b\u309a': 'jis1-04-11 u309A',
+            '\u02e5\u02e9': 'jis1-11-64 jis1-11-68',
+        }
+        for sequence, glyph_names in expectations.items():
+            self.assertEqual(glyph_names, cconv.decompose(sequence))
diff --git a/converter/convert.py b/converter/convert.py
@@ -48,16 +48,19 @@ def create_ufo(fonts, limit=None):
     fonts[0].set_ufo_metrics(ufo.info)
 
     vert_feature = []
+    liga_feature = []
 
     count = 0
     for font in fonts:
         for g in font.glyphs():
-            if len(g.unicode) > 1:
-                print('Cannot convert unicode sequence %s' % g.unicode, file=sys.stderr)
-                continue
-
             ufo_glyph = ufo.newGlyph(g.name())
-            ufo_glyph.unicodes = charset.variants(ord(g.unicode))
+
+            if len(g.unicode) == 1:
+                ufo_glyph.unicodes = charset.variants(ord(g.unicode))
+            else:
+                glyph_seq = font.codeconv.decompose(g.unicode)
+                liga_feature.append(' sub %s by %s;' % (glyph_seq, g.name()))
+
             ufo_glyph.width = font.width
             ufo_glyph.height = font.ascent - font.descent
             draw(g, ufo_glyph)
@@ -70,12 +73,21 @@ def create_ufo(fonts, limit=None):
                 draw(vg, ufo_vglyph)
                 vert_feature.append(' sub %s by %s;' % (g.name(), vg.name()))
 
+            if g.unicode == '\u309c':  # KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+                # Add COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK which is used in ligatures.
+                u309a = ufo.insertGlyph(ufo_glyph, 'u309A')
+                u309a.unicode = 0x309a
+
             count += 1
             if limit and count >= limit:
                 break
 
+    features = ''
     if len(vert_feature) > 0:
-        ufo.features.text = 'feature vert {\n' + '\n'.join(vert_feature) + '\n} vert;'
+        features += 'feature vert {\n' + '\n'.join(vert_feature) + '\n} vert;\n'
+    if len(liga_feature) > 0:
+        features += 'feature liga {\n' + '\n'.join(liga_feature) + '\n} liga;\n'
+    ufo.features.text = features
 
     print('%d glyphs converted' % count)
     return ufo