-
Notifications
You must be signed in to change notification settings - Fork 0
/
tipafy_file
executable file
·91 lines (84 loc) · 2.9 KB
/
tipafy_file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import eltk
from eltk.utils.CharConverter import *
from eltk.utils.functions import *
import codecs
import unicodedata
import re
# Rebind the module-level name `open` to codecs.open, which accepts the
# encoding as its third positional argument. This shadows the builtin open()
# for the rest of the module.
open = codecs.open
# Converters built from eltk's CharConverter. ipa_to_tipa() tries the 'tipa'
# target first and only falls back to the 'latex' one when the first leaves
# the token unchanged.
# NOTE(review): 'uni' presumably selects Unicode input - confirm against eltk.
tipa_converter=CharConverter('uni','tipa')
latex_converter=CharConverter('uni','latex')
other_converter = {"⇌":"$\\rightleftharpoons$"," ":" "}
def ipa_to_tipa(chars):
    """Convert one token to text that LaTeX can typeset.

    Three strategies are tried in order:

    1. ``tipa_converter``: when it changes the token, the result is wrapped
       in ``\\textipa{...}`` and returned.
    2. ``latex_converter`` followed by the ``other_converter`` replacements:
       when that changes the token, the converted text is returned.
    3. Otherwise every character above U+00FF is replaced by the ASCII part
       of its NFKD decomposition (which may be empty), and a LaTeX comment
       naming the replaced characters is appended.

    Args:
        chars: the token as a UTF-8 encoded byte string.

    Returns:
        The converted token as unicode text.

    NOTE(review): the converters are assumed to return UTF-8 byte strings,
    as the ``.decode("utf-8")`` on the TIPA result implies - confirm against
    eltk.
    """
    doc = tipa_converter.convert(chars)
    if str(doc) != str(chars):
        # The TIPA converter recognised something: hand it to \textipa.
        return u"\\textipa{" + doc.decode("utf-8") + u"}"

    doc = latex_converter.convert(doc)
    for needle in other_converter:
        doc = doc.replace(needle, other_converter[needle])
    if str(doc) != str(chars):
        # Return unicode like the other branches do, so that the caller can
        # join the pieces of a line without an implicit ASCII decode.
        if isinstance(doc, bytes):
            doc = doc.decode("utf-8")
        return doc

    # Neither converter knew the token: degrade it to something ASCII-ish.
    pieces = []
    changed = []
    for char in chars.decode("utf-8"):
        # Anything above U+00FF (the end of Latin-1) is normalised; the
        # boundary is 255 so that U+0100 is treated like its neighbours.
        if ord(char) > 255:
            ascii_part = unicodedata.normalize('NFKD', char).encode('ascii', 'ignore')
            pieces.append(ascii_part.decode('ascii'))
            # unicodedata.name() raises ValueError for unnamed code points
            # unless a default is supplied.
            changed.append(unicodedata.name(char, u"U+%04X" % ord(char)))
        else:
            pieces.append(char)
    doc = u"".join(pieces)
    if changed:
        # Only warn when something was actually replaced.
        doc += u"\n% Warning ! Normalized characters: " + u", ".join(changed) + u"\n"
    return doc
def char_in_extra_latin_range(char):
    """Return True when `char` lies in one of the extended Latin/IPA blocks.

    The accepted code point intervals (inclusive) are:

    * U+0080..U+036F (128..879), which spans Latin-1 Supplement,
      Latin Extended-A, Latin Extended-B, IPA Extensions,
      Spacing Modifier Letters and Combining Diacritical Marks;
    * U+2C60..U+2C7F (11360..11391), Latin Extended-C;
    * U+A720..U+A7FF (42784..43007), Latin Extended-D.
    """
    code_point = ord(char)
    for lowest, highest in ((128, 879), (11360, 11391), (42784, 43007)):
        if lowest <= code_point <= highest:
            return True
    return False
def is_phonetic_notation(token):
    """Decide whether `token` should be treated as phonetic notation.

    A token qualifies when it is written between slashes (``/.../`` with at
    least one character in between), or when at least one of its characters
    is accepted by char_in_extra_latin_range().
    """
    if token.startswith(u"/") and token.endswith(u"/") and len(token) > 2:
        # Explicitly marked as a phonemic transcription.
        return True
    for c in token:
        if char_in_extra_latin_range(c):
            return True
    return False
# Splits a line at the first '%' that is not escaped with a backslash.
_COMMENT_RE = re.compile(r"(?<!\\)\%")
# Captures runs of non-delimiter characters; because of the capture group,
# split() returns delimiters and tokens alternately.
# NOTE(review): inside the character class, `:-_` is parsed as a range from
# ':' to '_'. That range covers the ASCII uppercase letters (they act as
# delimiters) and leaves '-' itself out of the delimiter set. This looks
# accidental, but changing it alters tokenization, so it is kept as is.
_TOKEN_RE = re.compile(u'([^\\\{\}$ \[\]\(\)\<\>!.,;:-_?"%\']+)')


def _open_input(argv):
    """Return a UTF-8 decoding reader for the file named in argv[1], or stdin."""
    if len(argv) > 1:
        return codecs.open(argv[1], "r", "utf8")
    # Decode stdin explicitly so both input paths yield unicode lines.
    # On Python 2 sys.stdin has no `buffer` attribute and already yields bytes.
    return codecs.getreader("utf8")(getattr(sys.stdin, "buffer", sys.stdin))


def _emit(text):
    """Write `text` followed by a newline to stdout, encoded as UTF-8."""
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(text.encode("utf-8") + b"\n")


def _convert_line(line):
    """Return the pieces of a document-body line with phonetic tokens converted.

    A trailing LaTeX comment (from the first unescaped '%') is kept verbatim
    as the last piece.
    """
    parts = _COMMENT_RE.split(line, maxsplit=1)
    comment = u"%" + parts[1] if len(parts) > 1 else u""
    pieces = _TOKEN_RE.split(parts[0])
    for i, token in enumerate(pieces):
        if is_phonetic_notation(token):
            pieces[i] = ipa_to_tipa(token.encode("utf8"))
    pieces.append(comment)
    return pieces


def main():
    """Copy a LaTeX document to stdout, converting phonetic tokens in its body.

    The input is the file named by the first command-line argument, or stdin
    when no argument is given. Preamble lines are copied unchanged, and
    ``\\usepackage{tipa}`` is inserted just before ``\\begin{document}`` when
    the preamble did not already contain it. Lines after
    ``\\begin{document}`` go through _convert_line().
    """
    flow = _open_input(sys.argv)
    indocument = False
    tipa = False
    try:
        for line in flow:
            line = line.strip(u"\n")
            if indocument:
                newline = _convert_line(line)
                try:
                    _emit(u"".join(newline))
                except Exception:
                    # Report on stderr so the diagnostic does not end up in
                    # the generated LaTeX, then let the error propagate.
                    sys.stderr.write("ERROR: %r\n" % (newline,))
                    raise
                continue
            if u"\\usepackage{tipa}" in line:
                tipa = True
            if u"\\begin{document}" in line:
                if not tipa:
                    _emit(u"\\usepackage{tipa}")
                indocument = True
            _emit(line)
    finally:
        # Release the input even when a line fails to convert.
        flow.close()


if __name__ == "__main__":
    main()