Skip to content

Commit

Permalink
Add Maths symbols
Browse files Browse the repository at this point in the history
This is also the start of some slight restructuring for the extractors
to re-use code and data.

Issue: #26

Co-Authored-By: Fabian Winter <5821180+fdw@users.noreply.github.com>
Co-Authored-By: Christophe-Marie Duquesne <chmd@chmd.fr>
  • Loading branch information
fdw and chmduquesne committed Mar 6, 2020
1 parent 89212a1 commit e366d09
Show file tree
Hide file tree
Showing 3 changed files with 2,827 additions and 0 deletions.
49 changes: 49 additions & 0 deletions extractors/extract_maths_symbols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import html
from collections import namedtuple
from typing import List

import requests

from extractors.extractor import Extractor

# Record type pairing one symbol (`char`) with its Unicode name (`name`).
Character = namedtuple('Character', 'char name')


class MathExtractor(Extractor):
    """Extractor that downloads the Unicode maths symbols and writes them out.

    The symbol list comes from ``MathClassEx-15.txt``; the names are looked up
    in ``self.names``, which the ``Extractor`` base class populates.
    """

    def fetch_math_symbols(self: 'MathExtractor') -> List[Character]:
        """Download the maths class file and resolve every listed code point.

        Lines that are empty or start with ``#`` are skipped. The first
        ``;``-separated field of every other line is a code point or a
        ``start..end`` range, which is expanded into individual characters.

        Returns:
            One ``Character`` (symbol plus name) per resolved code point, in
            file order.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            KeyError: if a resolved symbol has no entry in ``self.names``.
        """
        print('Downloading list of maths symbols...')

        data = requests.get(
            'https://unicode.org/Public/math/latest/MathClassEx-15.txt',
            timeout=60
        )  # type: requests.Response
        # Fail loudly on an HTTP error instead of parsing an error page.
        data.raise_for_status()

        # `encoding` can be None when the headers give no charset hint, and
        # bytes.decode(None) raises TypeError, so fall back to UTF-8.
        text = data.content.decode(data.encoding or 'utf-8')

        characters = []
        # Split on '\n' only: str.splitlines() also breaks on other
        # separators (e.g. '\x85') that may occur inside the decoded data.
        for line in text.split('\n'):
            line = line.strip()  # also drops a trailing '\r'
            if not line or line.startswith('#'):
                continue

            fields = line.split(';')
            symbols = self.resolve_character_range(fields[0].strip())
            characters.extend(
                Character(symbol, self.names[symbol]) for symbol in symbols
            )

        return characters

    def write_file(self: 'MathExtractor', symbols: List[Character]):
        """Write one ``<char> <html-escaped name>`` line per symbol.

        The target is ``../picker/data/math.csv``, resolved against the
        current working directory; an existing file is overwritten.
        """
        # Explicit UTF-8: the symbols are non-ASCII and the platform default
        # encoding may be unable to represent them. The context manager
        # closes the file even if a write fails.
        with open("../picker/data/math.csv", 'w', encoding='utf-8') as symbol_file:
            symbol_file.writelines(
                f"{character.char} {html.escape(character.name)}\n"
                for character in symbols
            )

    def extract(self):
        """Fetch the maths symbols and write them to the data file."""
        self.write_file(self.fetch_math_symbols())


# Script entry point: build the extractor (which downloads the name table in
# its constructor) and run the extraction.
if __name__ == "__main__":
    extractor = MathExtractor()
    extractor.extract()
41 changes: 41 additions & 0 deletions extractors/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from typing import Dict, List

import requests


class Extractor(object):
    """Base class for the symbol extractors.

    Constructing an instance downloads ``UnicodeData.txt`` and stores a
    mapping from each listed character to its title-cased name in
    ``self.names``. Subclasses override :meth:`extract`.
    """

    def __init__(self):
        self.names = self.__fetch_names()

    def extract(self):
        """Do nothing; subclasses override this to produce their output."""
        pass

    def __fetch_names(self) -> Dict[str, str]:
        """Download ``UnicodeData.txt`` and map each character to its name.

        Returns:
            A dict keyed by the character built from the hexadecimal first
            field of each line, with the second field in title case as value.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        print("Fetching all names")

        response = requests.get(
            'https://unicode.org/Public/UNIDATA/UnicodeData.txt',
            timeout=60
        )  # type: requests.Response
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()

        # `encoding` can be None when the headers give no charset hint, and
        # bytes.decode(None) raises TypeError, so fall back to UTF-8.
        text = response.content.decode(response.encoding or 'utf-8')

        characters = {}
        for line in text.split('\n'):
            line = line.strip()  # also drops a trailing '\r'
            if not line:
                continue
            fields = line.split(';')
            characters[chr(int(fields[0], 16))] = fields[1].title()

        return characters

    def resolve_character_range(self, line: str) -> List[str]:
        """Expand a code point or an inclusive ``start..end`` range.

        Args:
            line: A hexadecimal code point such as ``'221E'``, or two of them
                joined by ``'..'`` such as ``'0041..0043'``.

        Returns:
            The corresponding characters; a single code point yields a
            one-element list.

        Raises:
            ValueError: if a part is not valid hexadecimal or is outside the
                range accepted by ``chr``.
        """
        start, separator, end = line.partition('..')
        if not separator:
            return [chr(int(line, 16))]
        # The range is inclusive on both ends, hence the + 1.
        return [chr(code) for code in range(int(start, 16), int(end, 16) + 1)]
Loading

0 comments on commit e366d09

Please sign in to comment.