-
Notifications
You must be signed in to change notification settings - Fork 4
/
eagle-hisp.py
123 lines (97 loc) · 3.34 KB
/
eagle-hisp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""
Accepted parameters:
-dry
Dry run: don't edit the wiki, but process and print all the data.
It's useful together with -always to check for crashes in the script before launching the bot.
-always
Don't ask for confirmation before submitting a new item to the wiki.
-start:<filename>
Start from file "<filename>". Useful for resuming an interrupted import.
"""
import pywikibot, os, re
import xml.etree.ElementTree as ET
DATA_DIR = 'EAGLE-data/Hispania epigrafica/'
def main():
    """Import every Hispania Epigrafica XML file as a new item on the wiki.

    Command-line flags (see the module docstring): -dry, -always and
    -start:<filename>. The files of DATA_DIR are visited in numeric order;
    for each one the HEp ID, IPR, Spanish translation and author are
    printed. Unless -dry was given, and once the user confirms (or -always
    is active), a new item is created carrying those values as claims.
    """
    dryRun = False
    skipConfirm = False
    resumeFrom = False

    # Handles command-line arguments for pywikibot.
    for option in pywikibot.handleArgs():
        if option == '-dry':  # Performs a dry run (does not edit site)
            dryRun = True
        elif option == '-always':  # Does not ask for confirmation
            skipConfirm = True
        elif option.startswith('-start:'):  # Example: -start:100
            resumeFrom = option.replace('-start:', '')

    # The data repository is only looked up when edits will really be made.
    if not dryRun:
        # pywikibot/families/eagle_family.py
        repo = pywikibot.Site('en', 'eagle').data_repository()

    def firstText(root, path):
        """Cleaned text of the first element matching *path* under *root*."""
        return elementText(root.findall(path)[0])

    # Numeric sorting for file names ("32.xml" must follow "4.xml"...)
    fileNames = os.listdir(DATA_DIR)
    fileNames.sort(key=idFromFilename)

    for xmlName in fileNames:
        if resumeFrom:
            if xmlName != resumeFrom + '.xml':
                continue  # Skips files until start
            resumeFrom = False  # Start file reached: stop skipping

        root = ET.parse(DATA_DIR + xmlName).getroot()

        # HispaniaEpigrafica ID + label
        hepId = xmlName[:-4]  # Remove extension (.xml)
        itemLabel = 'HEp ' + hepId
        pywikibot.output("\n>>>>> " + itemLabel + " <<<<<\n")
        pywikibot.output('HEp ID: ' + hepId)

        # Title
        itemTitle = firstText(root, './title')

        # IPR
        licence = firstText(root, './license')[1:-1]  # Strip quotes
        pywikibot.output('IPR: ' + licence)

        # ES Translation
        spanishText = firstText(root, './text')
        pywikibot.output('ES Translation: ' + spanishText)

        # Author
        translator = firstText(root, './translator')
        if translator:
            pywikibot.output('Author: ' + translator)
        else:
            pywikibot.output('WARNING: no author!')

        pywikibot.output('')  # newline

        if skipConfirm:
            answer = 'y'
        else:
            answer = pywikibot.inputChoice(u"Proceed?", ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')

        # "All" means yes for this item and for every following one.
        if answer in ('A', 'a'):
            skipConfirm = True
            answer = 'y'

        if dryRun or answer not in ('Y', 'y'):
            continue

        item = pywikibot.ItemPage(repo)
        item.editEntity({'labels':{'en': itemLabel, 'es': itemLabel}, 'descriptions':{'es': itemTitle}})
        item.get()

        addClaimToItem(repo, item, 'P22', hepId)
        addClaimToItem(repo, item, 'P25', licence)

        # The translation claim is built by hand so the author can be
        # attached to it as a source afterwards.
        translationClaim = pywikibot.Claim(repo, 'P14')
        translationClaim.setTarget(spanishText)
        item.addClaim(translationClaim)

        if translator:
            sourceClaim = pywikibot.Claim(repo, 'P21')
            sourceClaim.setTarget(translator)
            translationClaim.addSource(sourceClaim)
def idFromFilename(filename):
    """Return the integer found before the first dot of *filename*.

    Example:
        "3.xml" --> 3

    Raises ValueError when that leading part is not an integer.
    """
    stem, _dot, _rest = filename.partition('.')
    return int(stem)
def addClaimToItem(site, page, id, value):
    """Build a claim for property *id* targeting *value* and add it to *page*.

    *site* is passed to pywikibot.Claim together with the property id;
    the resulting claim is attached through page.addClaim().
    """
    newClaim = pywikibot.Claim(site, id)
    newClaim.setTarget(value)
    page.addClaim(newClaim)
def elementText(elem):
    """Get inner element text, stripping tags of sub-elements.

    The text of *elem* and of all its descendants is concatenated, leading
    and trailing whitespace is removed, every newline becomes a space and
    any run of two or more whitespace characters collapses to one space.

    Returns the cleaned string ('' when the element holds no text).
    """
    text = ''.join(elem.itertext()).strip()
    text = text.replace('\n', ' ')
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s{2,}', ' ', text)
if __name__ == "__main__":
    # Run the import only when this file is executed as a script.
    try:
        main()
    finally:
        # Always call pywikibot.stopme(), even when main() raised.
        pywikibot.stopme()