Add Maths symbols

This is also the start of a slight restructuring of the extractors so that they can re-use code and data.

Issue: #26
Co-Authored-By: Fabian Winter <5821180+fdw@users.noreply.github.com>
Co-Authored-By: Christophe-Marie Duquesne <chmd@chmd.fr>
fdw · Mar 6, 2020 · e366d09 · e366d09
1 parent 89212a1
commit e366d09
Show file tree

Hide file tree

Showing 3 changed files with 2,827 additions and 0 deletions.
diff --git a/extractors/extract_maths_symbols.py b/extractors/extract_maths_symbols.py
@@ -0,0 +1,49 @@
+import html
+from collections import namedtuple
+from typing import List
+
+import requests
+
+from extractors.extractor import Extractor
+
+# Record pairing a single character (``char``) with its name (``name``).
+Character = namedtuple(typename='Character', field_names='char name')
+
+
+class MathExtractor(Extractor):
+    """Extractor that builds the maths symbol list for the picker.
+
+    Character names are looked up in ``self.names``, which the ``Extractor``
+    base class fills in on construction.
+    """
+
+    def fetch_math_symbols(self: 'MathExtractor') -> List[Character]:
+        """Download MathClassEx and return its code points as characters.
+
+        Lines that are empty or start with ``#`` are skipped. The first
+        ``;``-separated field of every other line is expanded through
+        ``resolve_character_range`` and each resulting character is paired
+        with its name from ``self.names``.
+
+        Returns:
+            The characters in the order they appear in the downloaded file.
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status.
+            KeyError: if a listed character has no entry in ``self.names``.
+        """
+        print('Downloading list of maths symbols...')
+
+        data = requests.get(
+            'https://unicode.org/Public/math/latest/MathClassEx-15.txt',
+            timeout=60
+        )  # type: requests.Response
+        # Fail loudly instead of trying to parse an HTML error page as data.
+        data.raise_for_status()
+
+        # ``encoding`` can be None when the response headers carry no usable
+        # charset; decoding with None would raise TypeError.
+        text = data.content.decode(data.encoding or 'utf-8')
+
+        characters = []
+        for raw_line in text.split('\n'):
+            # Stripping also removes the '\r' left behind by CRLF endings and
+            # turns whitespace-only lines into empty ones.
+            line = raw_line.strip()
+            if not line or line.startswith('#'):
+                continue
+
+            code_points = line.split(';')[0].strip()
+            for symbol in self.resolve_character_range(code_points):
+                characters.append(Character(symbol, self.names[symbol]))
+
+        return characters
+
+    def write_file(self: 'MathExtractor', symbols: List[Character]) -> None:
+        """Write one ``<char> <HTML-escaped name>`` line per symbol.
+
+        The target is ``../picker/data/math.csv``, resolved against the
+        current working directory; an existing file is overwritten.
+        """
+        # UTF-8 is requested explicitly because the symbols may be non-ASCII
+        # and the platform default encoding might not be able to encode them.
+        # The ``with`` block closes the file even if a write fails.
+        with open("../picker/data/math.csv", 'w', encoding='utf-8') as symbol_file:
+            symbol_file.writelines(
+                f"{character.char} {html.escape(character.name)}\n"
+                for character in symbols
+            )
+
+    def extract(self):
+        """Download the maths symbols and write them to the data file."""
+        self.write_file(self.fetch_math_symbols())
+
+
+if __name__ == "__main__":
+    # Executed as a script: build the extractor, then run the whole pipeline.
+    extractor = MathExtractor()
+    extractor.extract()
diff --git a/extractors/extractor.py b/extractors/extractor.py
@@ -0,0 +1,41 @@
+from typing import Dict, List
+
+import requests
+
+
+class Extractor(object):
+    """Base class for the symbol extractors.
+
+    Downloads the Unicode character name table once on construction and
+    offers helpers that concrete extractors can share.
+    """
+
+    def __init__(self):
+        # Maps each character listed in UnicodeData.txt to its title-cased name.
+        self.names = self.__fetch_names()
+
+    def extract(self):
+        """Run the extraction; the base implementation does nothing."""
+        pass
+
+    def __fetch_names(self) -> Dict[str, str]:
+        """Download UnicodeData.txt and map each character to its name.
+
+        Returns:
+            A dict from a one-character string (built from the hexadecimal
+            first field of a line) to the line's second field, title-cased.
+
+        Raises:
+            requests.HTTPError: if the server answers with an error status.
+        """
+        print("Fetching all names")
+
+        response = requests.get(
+            'https://unicode.org/Public/UNIDATA/UnicodeData.txt',
+            timeout=60
+        )  # type: requests.Response
+        # Fail loudly instead of trying to parse an HTML error page as data.
+        response.raise_for_status()
+
+        # ``encoding`` can be None when the response headers carry no usable
+        # charset; decoding with None would raise TypeError.
+        text = response.content.decode(response.encoding or 'utf-8')
+
+        characters = {}
+        for raw_line in text.split('\n'):
+            # Stripping also removes the '\r' left behind by CRLF endings and
+            # turns whitespace-only lines into empty ones.
+            line = raw_line.strip()
+            if not line:
+                continue
+            fields = line.split(';')
+            characters[chr(int(fields[0], 16))] = fields[1].title()
+
+        return characters
+
+    def resolve_character_range(self, line: str) -> List[str]:
+        """Expand a code point or ``start..end`` range into characters.
+
+        Args:
+            line: A hexadecimal code point such as ``2211``, or an inclusive
+                range of two code points such as ``2200..2203``.
+
+        Returns:
+            The characters in ascending code point order; a single code
+            point yields a one-element list.
+
+        Raises:
+            ValueError: if a bound is not a valid hexadecimal code point.
+        """
+        start, separator, end = line.partition('..')
+        first = int(start, 16)
+        # Without a '..' separator the input names a single code point.
+        last = int(end, 16) if separator else first
+        return [chr(code) for code in range(first, last + 1)]