-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcsv_to_grammar.py
executable file
·281 lines (242 loc) · 7.76 KB
/
csv_to_grammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#! /usr/bin/env python
# Creates a GF grammar on the basis of a CSV file.
# The idea is to manage multilingual vocabularies in a spreadsheet
# (collaboratively) and automatically convert it into GF.
#
# Rows:
# 1. Language names
# 2. Module headers (interpreted as Python templates, with slots 'name', 'lang')
# 3-n. data
#
# Columns:
# 1. English word (lemma form), should be reasonably long and unambiguous
# 2. GF (RGL) category
# 3. comma-separated tags (tagged rows can be skipped with --exclude)
# 4. lin operation or simple string in language 1
# 5. lin operation or simple string in language 2
# ...
#
# The grammar name is the name of the input file.
# The language names must be provided in the first row.
# Author: Kaarel Kaljurand
# Version: 2013-03-01
#
# Examples:
#
# ./csv_to_grammar.py --file Sheet1.csv --name Geograpy --dir outdir
#
# TODO:
# - catch some errors, e.g. wrong parentheses structure
# - add: --url <url of CSV-formatted data>
#
import sys
import argparse
import os
import re
import csv
from string import Template
# GF path directive that is put at the beginning of every generated module.
# It is currently empty; the commented-out line shows an alternative value.
#path_directive = "--# -path=.:present\n"
path_directive = ""
# Zero-based indices of the CSV columns.
# Every column from this index onwards holds the entries of one language;
# the column with exactly this index is treated as the ACE column by make_lin.
first_lang_col = 3
# Column with the word from which the function name is built.
fun_col = 0
# Column with the GF category.
cat_col = 1
# Column with the comma-separated tags of the row.
tags_col = 2
def write_file(dir, filename, content):
    """
    Write content into the file called filename in the directory dir.

    The path of the file is reported on stderr before the file is written.
    The file is opened in mode 'w', i.e. an existing file is overwritten.
    """
    path = os.path.join(dir, filename)
    sys.stderr.write('Creating: ' + path + '\n')
    # The with-statement closes the file even if the write fails.
    with open(path, 'w') as f:
        f.write(content)
def make_grammar_name(filename):
    """
    Derive a grammar name from the path of the input file.

    Three substitutions are applied in turn: the directory part (everything
    up to the last slash) is removed, then everything from the first dot
    onwards is removed, and finally each character that is not an ASCII
    letter or digit is replaced by an underscore.

    TODO: better regexp for generating legal GF grammar names
    """
    substitutions = (
        (r'.*\/', ''),
        (r'\..*', ''),
        (r'[^A-Za-z0-9]', '_'),
    )
    name = filename
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name
def make_fun_name(word, cat):
    """
    Build the GF function name of an entry: the cleaned-up word, converted
    by unicode_to_gfcode, followed by an underscore and the category.

    Raises an Exception if nothing is left of the word after the clean-up.
    """
    stripped = strip_cell(word)
    if not stripped:
        raise Exception("empty function name")
    return unicode_to_gfcode(stripped) + "_" + cat
def make_cat(cat, default_cat):
    """
    Return the cleaned-up category cell,
    or default_cat if the cell is empty after the clean-up.
    """
    # strip_cell always returns a string, and only the empty string is falsy.
    return strip_cell(cat) or default_cat
def strip_cell(cell):
    """
    Clean up the content of a CSV cell: drop every [comment text],
    collapse each run of whitespace into a single space,
    and trim the whitespace at both ends.
    """
    without_comments = re.sub(r'\[[^]]*\]', '', cell)
    return re.sub(r'\s+', ' ', without_comments).strip()
def make_lin(cell, cat, col_id, cell0):
    """
    Create the linearization (the right-hand side of a lin rule) from a cell.

    If the lin cell contains a bare string (e.g. '"capital" feminine')
    i.e. no operator call (e.g. 'mkV2 "ask"', 'mkV2 L.ask_V'),
    then a call to a smart paradigm is created.
    The whitespace is trimmed.
    A cell that contains neither a double quote nor an uppercase letter
    followed by a dot is put into quotes, and the spaces inside quoted
    strings are replaced by underscores.

    Arguments:
    cell -- raw content of the cell
    cat -- GF category of the entry
    col_id -- index of the column of the cell; the column with the index
              first_lang_col is treated as the ACE column
    cell0 -- entry that is used instead of an empty cell in the ACE column

    Returns the linearization as a string,
    or None if the cell is empty and there is no replacement for it.
    An Exception raised by underscore_strings (unfinished string)
    is passed on to the caller.

    TODO: rewrite the ACE-specific code in a general way
    """
    is_ace_col = (col_id == first_lang_col)
    cell = strip_cell(cell)
    # If the cell is empty then we return None,
    # unless we are in the ACE-column in which case
    # we'll try to use the function name as the ACE entry.
    if cell == "":
        if not is_ace_col or cell0 == "":
            return None
        cell = cell0
    # Add quotes if there are no existing quotes
    # and it is not an entry like 'mkV2 (I.contener_V)'.
    # (The pattern is a raw string so that '\.' is not an invalid escape.)
    if '"' not in cell and not re.search(r'[A-Z]\.', cell):
        cell = '"' + cell + '"'
    cell = underscore_strings(cell)
    # A cell that already mentions one of the expected operators is used
    # as it is, otherwise it is wrapped into the default operator call.
    if cat == "CN":
        if not has_prefix_some(cell, ['mkCN', 'aceN']):
            if is_ace_col:
                return 'aceN {0}'.format(cell)
            return 'mkCN (mkN {0})'.format(cell)
    elif cat in ("V2", "V2by"):
        if not has_prefix_some(cell, ['mkV2', 'prepV2', 'aceV2']):
            if is_ace_col:
                return 'aceV2 {0}'.format(cell)
            return 'mkV2 (mkV {0})'.format(cell)
    else:
        if not has_prefix_some(cell, ['mk' + cat, 'ace' + cat]):
            if is_ace_col:
                return 'ace{0} {1}'.format(cat, cell)
            return 'mk{0} {1}'.format(cat, cell)
    return cell
def has_prefix_some(s, prefix_set):
    """
    True if at least one of the strings in prefix_set occurs in s.

    NOTE: in spite of the name, an occurrence anywhere in s counts,
    not only an occurrence at the beginning of s.
    """
    return any(prefix in s for prefix in prefix_set)
def underscore_strings(entry):
    """
    Replace each space that is inside a double-quoted string by an
    underscore. Everything outside of the quoted strings is kept as it is.

    Raises an Exception if the last quoted string is not closed.
    """
    pieces = []
    inside = False
    for ch in entry:
        if ch == '"':
            # Every double quote opens or closes a string.
            inside = not inside
            pieces.append(ch)
        elif inside and ch == ' ':
            pieces.append('_')
        else:
            pieces.append(ch)
    if inside:
        raise Exception("unfinished string")
    return "".join(pieces)
def unicode_to_gfcode(u):
    """
    Convert a string into a form that contains only ASCII letters,
    digits, apostrophes and underscores.

    Each non-ASCII character is first replaced by its XML character
    reference (e.g. U+00FC becomes '&#252;'), and then every character
    other than an ASCII letter, a digit or an apostrophe is replaced
    by an underscore.

    Arguments:
    u -- UTF-8 encoded byte string, or an already decoded text string

    Returns a native string (str).
    """
    # Only byte strings are decoded: calling decode() on a text string
    # fails on Python 3 and goes through an implicit ASCII encoding
    # (which breaks on non-ASCII text) on Python 2.
    if isinstance(u, bytes):
        u = u.decode("utf8")
    encoded = u.encode('ascii', 'xmlcharrefreplace')
    # On Python 2 the encoded value already is a str. On Python 3 it is
    # a bytes object which is turned back into a str for the substitution.
    if isinstance(encoded, str):
        ascii_text = encoded
    else:
        ascii_text = encoded.decode('ascii')
    return re.sub(r'[^A-Za-z0-9\']', '_', ascii_text)
def make_name2(u):
    """
    Return u without any whitespace and in lower case.
    """
    # The whitespace is removed before the lowercasing, as in the
    # two-step definition of the normalization.
    without_space = re.sub(r'\s+', '', u)
    return without_space.lower()
def tags(string):
    """
    Split a comma-separated string into the set of its parts,
    each with the surrounding whitespace removed.
    """
    return {part.strip() for part in string.split(',')}
# Commandline arguments parsing
parser = argparse.ArgumentParser(description='Generates 2 GF modules for a given language')
parser.add_argument('-f', '--file', type=str, action='store', dest='csv_file',
                    help='name of the CSV file (OBLIGATORY)')
parser.add_argument('-n', '--name', type=str, action='store', dest='name',
                    help='name of the grammar, e.g. Phrasebook (OBLIGATORY)')
parser.add_argument('--exclude', type=tags, action='store', dest='exclude_tags',
                    default=set(),
                    help='exclude each row tagged with one of these (comma-separated) tags')
parser.add_argument('-d', '--dir', type=str, action='store', dest='dir',
                    default='.',
                    help='output directory')
parser.add_argument('-v', '--version', action='version', version='%(prog)s v0.1')
args = parser.parse_args()

if args.csv_file is None:
    sys.stderr.write('ERROR: argument -f/--file is not specified\n')
    # Report the failure to the calling process with a non-zero status.
    sys.exit(1)

# If no grammar name is given then it is derived from the CSV file name.
if args.name is None:
    args.name = make_grammar_name(args.csv_file)

# funs: function name -> category (the content of the abstract syntax)
# funs2: normalized function name -> category (used to detect names that
#        differ only in case or whitespace)
# lins: column index -> (function name -> linearization)
funs = {}
funs2 = {}
lins = {}
header = []

with open(args.csv_file, 'rb') as csvfile:
    #dialect = csv.Sniffer().sniff(csvfile.read(1024))
    #csvfile.seek(0)
    #reader = csv.reader(csvfile, dialect)
    # TODO: we assume Google Drive's CSV conventions
    # as sniffing didn't seem to get these right.
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Row 1: language names, row 2: module header templates.
    header = next(reader)
    module_header = next(reader)
    for row in reader:
        # Any problem with a row is reported on stderr and the row is skipped.
        try:
            if len(row) <= first_lang_col:
                # The row must reach at least the first language column,
                # i.e. have first_lang_col + 1 fields. The number is
                # converted explicitly as str + int raises a TypeError.
                raise Exception("there should be at least " + str(first_lang_col + 1) + " fields")
            tagset = tags(row[tags_col])
            intersection = tagset.intersection(args.exclude_tags)
            if len(intersection) > 0:
                raise Exception("ignoring row based on tags " + str(intersection))
            cat = make_cat(row[cat_col], "PN")
            funname = make_fun_name(row[fun_col], cat)
            funname2 = make_name2(funname)
            if funname in funs:
                raise Exception("duplicate function name: '" + funname + "'")
            if funname2 in funs2:
                raise Exception("similar function name: '" + funname + "'")
            funs[funname] = cat
            funs2[funname2] = cat
            fun_cell = strip_cell(row[fun_col])
            for i, cell in enumerate(row[first_lang_col:], first_lang_col):
                # The column is registered before its cell is processed,
                # so that it is present even if the processing fails.
                col_lins = lins.setdefault(i, {})
                lin = make_lin(cell, cat, i, fun_cell)
                if lin is not None:
                    col_lins[funname] = lin
            sys.stderr.write('Reading: ' + ' | '.join(row) + '\n')
        except Exception as e:
            sys.stderr.write('Error: {:}: {:}\n'.format(str(e), ' | '.join(row)))

# Put the abstract syntax into a string
abstract = path_directive
abstract += Template(module_header[1]).substitute(name=args.name) + " {\nfun\n"
abstract += "".join(funname + " : " + funs[funname] + " ;\n"
                    for funname in sorted(funs, key=str.lower))
abstract += "}"
# ... and write it into a file.
write_file(args.dir, args.name + ".gf", abstract)

# Put each concrete syntax into a string
for l in sorted(lins):
    # A problem with one language is reported on stderr
    # and does not prevent the other languages from being written.
    try:
        lang_name = strip_cell(header[l])
        if lang_name == "":
            raise Exception("bad language name: '" + header[l] + "'")
        concrete = path_directive
        concrete += Template(module_header[l]).substitute(name=args.name, lang=lang_name) + " {\n"
        concrete += "flags coding=utf8 ;\nlin\n"
        concrete += "".join(funname + " = " + lins[l][funname] + " ;\n"
                            for funname in sorted(lins[l], key=str.lower))
        concrete += "}"
        # ... and write it into a file.
        write_file(args.dir, args.name + lang_name + ".gf", concrete)
    except Exception as e:
        sys.stderr.write('Error: {:}\n'.format(str(e)))