-
Notifications
You must be signed in to change notification settings - Fork 13
/
parse_wikitionary_pl.py
132 lines (106 loc) · 4.23 KB
/
parse_wikitionary_pl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import sys
import re
import time
import sqlite3
from zipfile import ZipFile, ZIP_DEFLATED
from xml.etree import ElementTree as ET
from wikitextparser import remove_markup, parse
"""
Downloading latest dump file :
wget https://dumps.wikimedia.org/plwiktionary/latest/plwiktionary-latest-pages-articles.xml.bz2
Parsing structure
znaczenia (meaning start marker)
rzeczownik (part of speech)
(1.1) poet. wieczór
(1.2) vespers: rel. nieszpory
(1.3) rel. dzwon wzywający na nieszpory
odmiana (meaning end marker)
"""
# Schema: one row per extracted definition.  etymology_no and definition_no
# are placeholders — main() currently always writes 1 for both.
SQL_CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS dictionary
(
id INTEGER PRIMARY KEY,
word TEXT,
lexical_category TEXT,
etymology_no INTEGER,
definition_no INTEGER,
definition TEXT
);
"""
# Each run starts from an empty table.
SQL_DELETE_ENTRIES = "DELETE from dictionary"
# Output SQLite database file (zipped at the end of main()).
DATABASE_FILE = "dictionary_pl.db"
def main():
    """Parse a Polish Wiktionary XML dump into an SQLite database.

    Reads the dump path from ``sys.argv[1]``, streams ``<page>`` elements
    with ``iterparse``, and for each section bracketed by a ``znaczenia``
    (meanings start) and ``odmiana`` (inflection, i.e. meanings end)
    template extracts the part of speech (first italic run) and the
    numbered definitions (first wikitext list).  Rows are committed in
    batches; the database file is zipped at the end.
    """
    # MediaWiki export XML namespace used by every element in the dump.
    ns = "{http://www.mediawiki.org/xml/export-0.10/}"
    # Strips a leading "(1.1)"-style numbering from a definition line.
    # Raw string (the original non-raw "\s"/"\(" escapes are deprecated)
    # and compiled once, outside the per-definition loop.
    numbering_re = re.compile(r"^\s*\(\s*\d+\s*\.\s*\d+\s*\)")

    connection = sqlite3.connect(DATABASE_FILE)
    cursor = connection.cursor()
    cursor.execute(SQL_CREATE_TABLE)
    cursor.execute(SQL_DELETE_ENTRIES)
    start_time = time.time()
    doc = ET.iterparse(sys.argv[1])
    index = 0
    words = 0
    count = 0
    for event, elem in doc:
        if "page" not in elem.tag:
            continue
        title = elem.find(f".//{ns}title").text
        content = elem.find(f".//{ns}revision").find(f".//{ns}text").text
        try:
            sections = parse(content).sections
            for section in sections:
                if not section.templates:
                    continue
                start, end = None, None
                for template in section.templates:
                    if template.name == "odmiana":
                        end = template
                    if template.name == "znaczenia":
                        start = template
                if not (start and end):
                    continue
                # Re-parse only the text between the two markers; spans
                # are absolute, so shift them by the section's own offset.
                snippet = parse(
                    section.string[
                        start.span[0] - section.span[0] : end.span[1]
                        - section.span[0]
                    ]
                )
                part_of_speech = snippet.get_italics()[0].text
                meanings = snippet.lists()[0].items
                for raw_meaning in meanings:
                    index += 1
                    meaning = remove_markup(raw_meaning).strip()
                    meaning = numbering_re.sub("", meaning)
                    cursor.execute(
                        "INSERT INTO dictionary VALUES (?, ?, ?, ?, ?, ?)",
                        (index, title, part_of_speech, 1, 1, meaning),
                    )
                    print(index, title, part_of_speech, 1, 1, meaning)
        except Exception as e:
            # Best-effort parsing: malformed or unexpected pages are
            # logged and skipped.  (IndexError is already covered by
            # Exception — the original tuple was redundant.)
            elem.clear()
            print(e)
            continue
        if count > 1000:
            # Commit in batches of ~1000 pages; sqlite3 starts a new
            # transaction automatically on the next INSERT, so no
            # explicit BEGIN/COMMIT statements are needed.
            count = 0
            connection.commit()
            print(
                f"Processing {words} words and {index} meanings took"
                f" {time.time()-start_time} seconds"
            )
        # https://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
        elem.clear()
        words += 1
        count += 1
    # BUG FIX: commit the final partial batch.  Previously the rows
    # inserted since the last batch commit were silently rolled back
    # when the connection closed.
    connection.commit()
    cursor.close()
    connection.close()
    print(f"Processing {words} words took {time.time()-start_time} seconds")
    with ZipFile(f"{DATABASE_FILE}.zip", "w", ZIP_DEFLATED) as zipf:
        zipf.write(DATABASE_FILE)
# Script entry point: parse the dump file given on the command line.
if __name__ == "__main__":
    main()