-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This is also the start of some slight restructuring for the extractors to re-use code and data. Issue: #26 Co-Authored-By: Fabian Winter <5821180+fdw@users.noreply.github.com> Co-Authored-By: Christophe-Marie Duquesne <chmd@chmd.fr>
- Loading branch information
1 parent
89212a1
commit e366d09
Showing
3 changed files
with
2,827 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import html | ||
from collections import namedtuple | ||
from typing import List | ||
|
||
import requests | ||
|
||
from extractors.extractor import Extractor | ||
|
||
Character = namedtuple('Character', 'char name') | ||
|
||
|
||
class MathExtractor(Extractor):
    """Extractor that produces the list of mathematical symbols.

    Character names are looked up in ``self.names``, which the ``Extractor``
    base class fills in when the instance is constructed.
    """

    def fetch_math_symbols(self) -> List[Character]:
        """Download the Unicode math class table and resolve it to characters.

        Each non-comment line of the table starts with a code point or a
        ``start..end`` code point range; every code point is paired with its
        name from ``self.names``.

        Returns:
            One ``Character(char, name)`` per code point listed in the table.

        Raises:
            requests.HTTPError: if the download returns an error status.
            KeyError: if a listed code point has no entry in ``self.names``.
        """
        print('Downloading list of maths symbols...')

        data = requests.get(
            'https://unicode.org/Public/math/latest/MathClassEx-15.txt',
            timeout=60
        )
        # Fail loudly instead of trying to parse an HTML error page as data.
        data.raise_for_status()
        # ``encoding`` is None when the server sends no charset; decoding
        # with None would raise TypeError, so fall back to UTF-8.
        text = data.content.decode(data.encoding or 'utf-8')

        characters = []
        for line in text.split('\n'):
            if not line or line.startswith('#'):
                continue

            # Only the first ';'-separated field (code point or range) is used.
            code_points = line.split(';')[0].strip()
            characters.extend(
                Character(symbol, self.names[symbol])
                for symbol in self.resolve_character_range(code_points)
            )

        return characters

    def write_file(self, symbols: List[Character]) -> None:
        """Write the symbols to ``../picker/data/math.csv``.

        One line is written per character: the character itself, a space,
        and its HTML-escaped name.
        """
        # The context manager closes the file even if a write fails; the
        # explicit encoding keeps the output independent of the locale.
        with open("../picker/data/math.csv", 'w', encoding='utf-8') as symbol_file:
            symbol_file.writelines(
                f"{character.char} {html.escape(character.name)}\n"
                for character in symbols
            )

    def extract(self) -> None:
        """Fetch the math symbols and write them to the data file."""
        self.write_file(self.fetch_math_symbols())
|
||
|
||
# Script entry point: build the extractor (which downloads the Unicode name
# table on construction) and run the math symbol extraction.
if __name__ == "__main__":
    extractor = MathExtractor()
    extractor.extract()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from typing import Dict, List | ||
|
||
import requests | ||
|
||
|
||
class Extractor(object):
    """Base class for extractors, holding shared Unicode name data.

    On construction the Unicode character database is downloaded and stored
    in ``self.names`` as a mapping from character to title-cased name.
    """

    def __init__(self):
        self.names = self.__fetch_names()

    def extract(self):
        """Run the extraction; a no-op here, meant to be overridden."""
        pass

    def __fetch_names(self) -> Dict[str, str]:
        """Download ``UnicodeData.txt`` and map each character to its name.

        Returns:
            A dict keyed by the character, whose value is the second
            ';'-separated field of its line, title-cased.

        Raises:
            requests.HTTPError: if the download returns an error status.
        """
        print("Fetching all names")

        response = requests.get(
            'https://unicode.org/Public/UNIDATA/UnicodeData.txt',
            timeout=60
        )
        # Fail loudly instead of trying to parse an HTML error page as data.
        response.raise_for_status()
        # ``encoding`` is None when the server sends no charset; decoding
        # with None would raise TypeError, so fall back to UTF-8.
        text = response.content.decode(response.encoding or 'utf-8')

        characters = {}
        for line in text.split('\n'):
            if not line:
                continue
            fields = line.split(';')
            # Field 0 is the hexadecimal code point, field 1 the name.
            characters[chr(int(fields[0], 16))] = fields[1].title()

        return characters

    def resolve_character_range(self, line: str) -> List[str]:
        """Expand a code point or ``start..end`` range into characters.

        Args:
            line: A hexadecimal code point (``'2200'``) or an inclusive
                range of them (``'2200..2203'``).

        Returns:
            The list of characters denoted by ``line``; a single code point
            yields a one-element list.

        Raises:
            ValueError: if ``line`` is not valid hexadecimal or denotes a
                code point outside the Unicode range.
        """
        start, separator, end = line.partition('..')
        if not separator:
            return [chr(int(line, 16))]
        return [
            chr(code_point)
            for code_point in range(int(start, 16), int(end, 16) + 1)
        ]
Oops, something went wrong.