forked from LBeaudoux/iso639
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate.py
483 lines (428 loc) · 13.6 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
import json
import pickle
import sqlite3
from collections import namedtuple
from iso639.datafile import get_file
from iso639.iso639 import Lang
Iso6392 = namedtuple(
"Iso6392",
[
"alpha3_bibliographic",
"alpha3_terminologic",
"alpha2",
"English_name",
"French_name",
],
)
def read_iso6392(datafile: str) -> Iso6392:
"""Read ISO 639-2 data from its source file"""
with open(datafile, encoding="utf-8-sig") as f:
for line in f:
row = line.rstrip().split("|")
yield Iso6392(*row)
def load_iso6392(datafile: str, db: sqlite3.Connection):
"""Load ISO 639-2 data into a database table"""
with db:
db.execute(
"""
CREATE TEMPORARY TABLE iso6392 (
pt2b CHAR(3) PRIMARY KEY,
name TEXT NOT NULL,
pt2t CHAR(3) NULL,
pt1 CHAR(2) NULL
)
"""
)
for iso6392 in read_iso6392(datafile):
db.execute(
"INSERT INTO iso6392 VALUES (?, ?, ?, ?)",
(
iso6392.alpha3_bibliographic,
iso6392.English_name,
iso6392.alpha3_terminologic,
iso6392.alpha2,
),
)
Iso6393 = namedtuple(
"Iso6393",
[
"Id",
"Part2B",
"Part2T",
"Part1",
"Scope",
"Language_Type",
"Ref_Name",
],
)
def read_iso6393(datafile: str) -> Iso6393:
"""Read ISO 639-3 data from its source file"""
with open(datafile, encoding="utf-8") as f:
next(f)
for line in f:
row = line.rstrip().split("\t")
yield Iso6393(*row[:7])
def load_iso6393(datafile: str, db: sqlite3.Connection):
"""Load ISO 639-3 data into a database table"""
with db:
db.execute(
"""
CREATE TEMPORARY TABLE iso6393 (
pt3 CHAR(3) PRIMARY KEY,
name TEXT NOT NULL,
pt1 CHAR(2) NULL,
pt2b CHAR(3) NULL,
pt2t CHAR(3) NULL,
scope CHAR(1) NOT NULL,
type CHAR(1) NOT NULL
)
"""
)
for iso6393 in read_iso6393(datafile):
db.execute(
"INSERT INTO iso6393 VALUES (?, ?, ?, ?, ?, ?, ?)",
(
iso6393.Id,
iso6393.Ref_Name,
iso6393.Part1,
iso6393.Part2B,
iso6393.Part2T,
iso6393.Scope,
iso6393.Language_Type,
),
)
Iso6395 = namedtuple(
"Iso6395", ["URI", "code", "Label_English", "Label_French"]
)
def read_iso6395(datafile: str) -> Iso6395:
"""Read ISO 639-5 data from its source file"""
with open(datafile, encoding="utf-8") as f:
next(f)
for line in f:
row = line.rstrip().split("\t")
yield Iso6395(*row)
def load_iso6395(datafile: str, db: sqlite3.Connection):
"""Load ISO 639-5 data into a database table"""
with db:
db.execute(
"""
CREATE TEMPORARY TABLE iso6395 (
pt5 CHAR(3) PRIMARY KEY,
name TEXT NOT NULL
)
"""
)
for iso6395 in read_iso6395(datafile):
db.execute(
"INSERT INTO iso6395 (pt5, name) VALUES (?, ?)",
(iso6395.code, iso6395.Label_English),
)
def build_iso639(db: sqlite3.Connection):
"""Merge ISO 639-1, ISO 639-2, ISO 639-3 and ISO 639-5
data into a single table
"""
with db:
db.execute("DROP TABLE IF EXISTS iso639")
db.execute(
"""
CREATE TABLE iso639 (
name TEXT PRIMARY KEY,
pt1 CHAR(2) NULL,
pt2b CHAR(3) NULL,
pt2t CHAR(3) NULL,
pt3 CHAR(3) NULL,
pt5 CHAR(3) NULL,
scope CHAR(1) NULL,
type CHAR(1) NULL
)
"""
)
db.execute(
"""
INSERT INTO iso639
SELECT
iso6393.name,
iso6393.pt1,
iso6393.pt2b,
iso6393.pt2t,
iso6393.pt3,
'',
iso6393.scope,
iso6393.type
FROM iso6393
"""
)
db.execute(
"""
INSERT INTO iso639
SELECT
iso6395.name,
IFNULL(iso6392.pt1, ''),
IFNULL(iso6392.pt2b, ''),
IFNULL(iso6392.pt2t, ''),
'',
iso6395.pt5,
'',
''
FROM iso6395
LEFT JOIN iso6392
ON iso6395.pt5 = iso6392.pt2b
"""
)
for pt in ("name", "pt1", "pt2b", "pt2t", "pt3", "pt5"):
db.execute(
"CREATE INDEX idx_iso639_{0} ON iso639 ({0})".format(pt)
)
Retirement = namedtuple(
"Retirement",
[
"Id",
"Ref_Name",
"Ret_Reason",
"Change_To",
"Ret_Remedy",
"Effective",
],
)
def read_retirement(datafile: str) -> Retirement:
"""Read ISO 639-3 retirement data from its source file"""
with open(datafile, encoding="utf-8") as f:
next(f)
for line in f:
row = line.rstrip().split("\t")
yield Retirement(*row)
def load_retirements(datafile: str, db: sqlite3.Connection):
"""Load retirements into a database table"""
with db:
db.execute(
"""
CREATE TEMPORARY TABLE temp_retirements (
pt3 CHAR(3) PRIMARY KEY,
name VARCHAR(150) NOT NULL,
reason CHAR(1) NOT NULL,
change_to CHAR(3) NULL,
ret_remedy VARCHAR(300) NULL,
effective DATE NOT NULL
)
"""
)
for ret in read_retirement(datafile):
db.execute(
"INSERT INTO temp_retirements VALUES (?, ?, ?, ?, ?, ?)",
(
ret.Id,
ret.Ref_Name,
ret.Ret_Reason,
ret.Change_To,
ret.Ret_Remedy,
ret.Effective,
),
)
def filter_retirements(db: sqlite3.Connection):
"""Discard infinite retirement loops"""
with db:
db.execute("DROP TABLE IF EXISTS retirements")
db.execute(
"""
CREATE TABLE retirements (
pt3 CHAR(3) PRIMARY KEY,
name VARCHAR(150) NOT NULL,
reason CHAR(1) NOT NULL,
change_to CHAR(3) NULL,
ret_remedy VARCHAR(300) NULL,
effective DATE NOT NULL
)
"""
)
db.execute(
"""
INSERT INTO retirements
SELECT
rt1.pt3,
rt1.name,
rt1.reason,
rt1.change_to,
rt1.ret_remedy,
rt1.effective
FROM temp_retirements as rt1
LEFT JOIN temp_retirements as rt2
ON rt1.change_to = rt2.pt3
WHERE rt2.change_to IS NULL
OR (
rt1.pt3 != rt1.change_to
AND rt1.effective > rt2.effective
)
"""
)
db.execute(
"""
CREATE INDEX idx_retirements_effective
ON retirements (effective)
"""
)
MacroLanguage = namedtuple("Macro", ["M_Id", "I_Id", "I_Status"])
def read_macro(datafile: str) -> MacroLanguage:
"""Read ISO 639-3 macrolanguage data from its source file"""
with open(datafile, encoding="utf-8") as f:
next(f)
for line in f:
row = line.rstrip().split("\t")
yield MacroLanguage(*row)
def load_macros(datafile: str, db: sqlite3.Connection):
"""Load ISO 639-3 macrolanguage data into a database table"""
with db:
db.execute("DROP TABLE IF EXISTS macros")
db.execute(
"""
CREATE TEMPORARY TABLE temp_macros (
macro CHAR(3) NOT NULL,
individual CHAR(3) NOT NULL,
individual_status CHAR(1) NOT NULL
)
"""
)
for macro in read_macro(datafile):
db.execute(
"INSERT INTO temp_macros VALUES (?, ?, ?)",
(macro.M_Id, macro.I_Id, macro.I_Status),
)
db.execute(
"""
CREATE TABLE macros (
macro CHAR(3) NOT NULL,
individual CHAR(3) NOT NULL
)
"""
)
# route deprecated individual languages
db.execute(
"""
INSERT INTO macros
SELECT
CASE
WHEN r1.change_to IS NULL OR r1.change_to = ''
THEN m.macro
ELSE r1.change_to
END AS macro,
CASE
WHEN r2.change_to IS NULL OR r2.change_to = ''
THEN m.individual
ELSE r2.change_to
END AS individual
FROM temp_macros AS m
LEFT JOIN retirements r1
ON m.macro = r1.pt3
LEFT JOIN retirements r2
ON m.individual = r2.pt3
"""
)
db.execute(
"""
CREATE INDEX idx_macros_macro_individual
ON macros (macro, individual)
"""
)
db.execute(
"""
CREATE INDEX idx_macro_individual
ON macros (individual)
"""
)
def serialize_iso639(db: sqlite3.Connection, datafile: str):
with db:
mapping = {}
for tag in ("name", "pt1", "pt2b", "pt2t", "pt3", "pt5"):
sql = """
SELECT name, pt1, pt2b, pt2t, pt3, pt5
FROM iso639
WHERE {0} != ''
ORDER BY {0}
""".format(
tag
)
for row in db.execute(sql):
dict_row = dict(row)
dict_row.pop(tag)
mapping.setdefault(tag, {})[row[tag]] = dict_row
with open(datafile, "w", encoding="utf-8") as f:
json.dump(mapping, f)
def serialize_scope(db: sqlite3.Connection, datafile: str):
with db:
sql = "SELECT pt3, scope FROM iso639 WHERE pt3 != '' ORDER BY pt3"
mapping = {row[0]: row[1] for row in db.execute(sql)}
with open(datafile, "w", encoding="utf-8") as f:
json.dump(mapping, f)
def serialize_type(db: sqlite3.Connection, datafile: str):
with db:
sql = "SELECT pt3, type FROM iso639 WHERE pt3 != '' ORDER BY pt3"
mapping = {row[0]: row[1] for row in db.execute(sql)}
with open(datafile, "w", encoding="utf-8") as f:
json.dump(mapping, f)
def serialize_deprecated(db: sqlite3.Connection, datafile: str):
with db:
mapping = {}
sql = "SELECT * FROM retirements ORDER BY pt3, effective"
for row in db.execute(sql):
dict_row = dict(row)
dict_row.pop("pt3")
mapping[row["pt3"]] = dict_row
with open(datafile, "w", encoding="utf-8") as f:
json.dump(mapping, f)
def serialize_macro(db: sqlite3.Connection, datafile: str):
with db:
mapping = {}
sql = "SELECT macro, individual FROM macros"
for macro, individual in db.execute(sql):
mapping.setdefault("macro", {}).setdefault(macro, []).append(
individual
)
mapping.setdefault("individual", {})[individual] = macro
with open(datafile, "w", encoding="utf-8") as f:
json.dump(mapping, f)
def serialize_langs(db: sqlite3.Connection, datafile: str):
Lang._reset()
with db:
sql = """
SELECT iso639.name
FROM iso639
LEFT JOIN retirements
ON iso639.pt3 = retirements.pt3
WHERE retirements.pt3 IS NULL
ORDER BY iso639.name
"""
all_langs = []
for (name,) in db.execute(sql):
all_langs.append(Lang(name))
with open(datafile, "wb") as f:
pickle.dump(all_langs, f)
if __name__ == "__main__":
con = sqlite3.connect(":memory:")
con.row_factory = sqlite3.Row
# load source files and transform tables
pt3_path = get_file("pt3")
load_iso6393(pt3_path, con)
pt5_path = get_file("pt5")
load_iso6395(pt5_path, con)
pt2_path = get_file("pt2")
load_iso6392(pt2_path, con)
build_iso639(con)
retirements_path = get_file("retirements")
load_retirements(retirements_path, con)
filter_retirements(con)
macros_path = get_file("macros")
load_macros(macros_path, con)
# export mapping results as JSON files
mapping_data_path = get_file("mapping_data")
serialize_iso639(con, mapping_data_path)
mapping_scope_path = get_file("mapping_scope")
serialize_scope(con, mapping_scope_path)
mapping_type_path = get_file("mapping_type")
serialize_type(con, mapping_type_path)
mapping_deprecated_path = get_file("mapping_deprecated")
serialize_deprecated(con, mapping_deprecated_path)
mapping_macro_path = get_file("mapping_macro")
serialize_macro(con, mapping_macro_path)
# pickle the list of all possible Lang instances
langs_path = get_file("list_langs")
serialize_langs(con, langs_path)
con.close()