-
Notifications
You must be signed in to change notification settings - Fork 4
/
convert-metadix-dix.py
124 lines (111 loc) · 3.19 KB
/
convert-metadix-dix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
#
# Copyright (C) 2020 Jaume Ortolà <jaume.ortola@gmail.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
import sys, re
import xml.etree.ElementTree as ET
def isMultiword(e):
    """Return True if entry element *e* contains multiword markup.

    An entry counts as multiword when:
      * any direct ``<i>`` child of *e* has a ``<b>`` child, or
      * the ``<l>`` or ``<r>`` child of the first ``<p>`` child of *e*
        has a ``<g>`` or ``<b>`` child.

    Args:
        e: an ``xml.etree.ElementTree.Element`` for one ``<e>`` entry.

    Returns:
        bool: True when any of the markers above is present, else False.
    """
    for part in e:
        # Compare against None explicitly: an Element without children
        # (such as an empty <b/>) is falsy, so a plain truth test would
        # never detect it.
        if part.tag == "i" and part.find("b") is not None:
            return True

    p = e.find("p")
    if p is None:
        return False

    # Both sides of the pair are inspected in the same way.
    for side_tag in ("l", "r"):
        side = p.find(side_tag)
        if side is None:
            continue
        if side.find("g") is not None or side.find("b") is not None:
            return True
    return False
def wordL(e):
    """Return the left-side word text of entry element *e*.

    The text of the last direct ``<i>`` child is used when it has any.
    Otherwise the text of the ``<l>`` child of the first ``<p>`` child is
    used. "Text" is the element's ``.text``, i.e. the character data that
    precedes its first sub-element.

    Args:
        e: an ``xml.etree.ElementTree.Element`` for one ``<e>`` entry.

    Returns:
        str: the word, or ``""`` when no text is available (including the
        case where ``<p>`` has no ``<l>`` child).
    """
    word = None
    for part in e:
        if part.tag == "i":
            word = part.text
    if word is None:
        p = e.find("p")
        if p is not None:
            left = p.find("l")
            # A <p> without <l> yields no word instead of raising
            # AttributeError on None.
            if left is not None:
                word = left.text
    return "" if word is None else word
def wordR(e):
    """Return the right-side word text of entry element *e*.

    The text of the last direct ``<i>`` child is used when it has any.
    Otherwise the text of the ``<r>`` child of the first ``<p>`` child is
    used. "Text" is the element's ``.text``, i.e. the character data that
    precedes its first sub-element.

    Args:
        e: an ``xml.etree.ElementTree.Element`` for one ``<e>`` entry.

    Returns:
        str: the word, or ``""`` when no text is available (including the
        case where ``<p>`` has no ``<r>`` child).
    """
    word = None
    for part in e:
        if part.tag == "i":
            word = part.text
    if word is None:
        p = e.find("p")
        if p is not None:
            right = p.find("r")
            # A <p> without <r> yields no word instead of raising
            # AttributeError on None.
            if right is not None:
                word = right.text
    return "" if word is None else word
# Script body: read the dictionary named by the first command-line argument,
# add a <par> reference to a "prefixes_*" paradigm to every eligible entry of
# the main section, and write the result to the second argument.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " SOURCE TARGET")
source = sys.argv[1]
target = sys.argv[2]

tree = ET.ElementTree()
tree.parse(source)

# Map a grammar class (the part of the paradigm name between "prefixes_" and
# the next underscore) to the full name of its "prefixes_*" paradigm.
pardefs = tree.find('pardefs')
prefixes = {}
for pardef in pardefs.iter(tag='pardef'):
    namepardef = pardef.get("n")
    if namepardef.startswith("prefixes_"):
        grammarclass = re.sub ("prefixes_([^_]+).*", "\\1", namepardef)
        prefixes[grammarclass]=namepardef

mainsection = tree.find('.//section[@id="main"]')
# Entries are modified inside the loop, and modifying a tree while iterating
# over it with iter() has undefined results, so iterate over a snapshot.
for e in list(mainsection.iter(tag='e')):
    # Locate the first <s> symbol: prefer the one inside <p><l>, otherwise
    # fall back to the one inside <i>.
    par = None
    p = e.find('p')
    if p is not None:
        left = p.find('l')
        if left is not None:
            par = left.find('s')
    if par is None:
        i = e.find('i')
        if i is not None:
            par = i.find('s')
    if par is None:
        continue
    if isMultiword(e):
        continue

    wordStrL = wordL(e)
    wordStrR = wordR(e)
    # Skip words containing any of these markers on either side.
    if any(mark in word
           for word in (wordStrL, wordStrR)
           for mark in ('<b', '_', '.')):
        continue
    # Skip entries whose words are too short on both sides (< 4) or on
    # either side (< 3).
    if len(wordStrL)<4 and len(wordStrR)<4:
        continue
    if len(wordStrL)<3 or len(wordStrR)<3:
        continue

    # The symbol's "n" attribute is matched against the grammar classes that
    # have a prefixes paradigm; on a match the paradigm reference becomes the
    # entry's first child.
    parname = par.get("n")
    if parname in prefixes:
        new = ET.Element('par')
        new.set('n', prefixes[parname])
        e.insert(0, new)

tree.write(target)