ner2tei.py
# Developed by Elina Leblanc (University of Geneva)
import os
import xml.etree.ElementTree as eT
import csv
import re

xml_folder = "../Encodage/TEI_tests/tei_ner"
new_csvLine = []  # Rows collected for the new CSV file
for file in os.listdir(xml_folder):
    if file.endswith('.xml'):
        xml_path = os.path.join(xml_folder, file)  # We create a path to the file
        # TEI namespace declaration
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        eT.register_namespace('', 'http://www.tei-c.org/ns/1.0')
        tree = eT.parse(xml_path)  # We parse the TEI file
        root = tree.getroot()  # We get the root
        id_doc = root.get("{http://www.w3.org/XML/1998/namespace}id")  # We get the ID of the document
        with open('../NER_Varios_enriched.csv', encoding='utf-8') as f:
            csv_file = csv.reader(f)  # We parse the CSV file
            next(csv_file)  # We skip the header line
            loc_list_deduplicated = []  # List with original names (no duplicates)
            loc_list_normalized = []  # List with normalised names (with duplicates)
            loc_list_normalized_geo = []  # List with normalised names + geographical coordinates (with duplicates)
            loc_list_normalized_wkd = []  # List with normalised names + Wikidata IDs (with duplicates)
            loc_list_occurrences = []
            for line in csv_file:  # We walk through the CSV rows
                if line[1].lower() == id_doc.lower():  # If the row has the same ID as the TEI file
                    concat_coords = str(line[6]) + ',' + str(line[7])
                    loc_list_normalized.append(line[5])  # List with normalised names
                    loc_list_normalized_geo.append([line[5], concat_coords])  # Nested list with normalised names + coordinates
                    loc_list_normalized_wkd.append([line[5], line[3]])  # Nested list with normalised names + Wikidata IDs
                    # We create a deduplicated list with the original names
                    if line[2] not in loc_list_deduplicated:
                        loc_list_deduplicated.append(line[2])
                    # print(loc_list_deduplicated)
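                    # Input-CSV layout assumed by the reads above (inferred from
                    # the indexing, not documented in the source): line[1] holds
                    # the document id, line[2] the original (surface) name,
                    # line[3] the Wikidata id, line[5] the normalised name, and
                    # line[6]/line[7] the two coordinate components.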
                    '''# We get information in the TEI files
                    shortTitle = root.find('.//tei:titleStmt/tei:title', ns).text
                    shortTitle = re.sub(r"\n( +)", " ", shortTitle)
                    printer = root.find('.//tei:publisher', ns).text
                    pubPlace = root.find('.//tei:pubPlace', ns).text
                    typeText = root.find('.//tei:keywords/tei:term', ns).text
                    genre = root.find(".//tei:keywords//tei:term[@type='sagrado_profano']", ns).text
                    date = root.find('.//tei:date', ns)
                    # print(genre.text)
                    # We get the longitude and latitude
                    if line[5] != '':
                        new_coords = line[5].split(',')
                        # print(new_coords[0])
                        long = new_coords[0]
                        lat = new_coords[1]
                    # We add the TEI information at the end of each line
                    line.append(long)
                    line.append(lat)
                    line.append(shortTitle)
                    line.append(pubPlace)
                    line.append(printer)
                    if re.match("184[0-9]", date.text):
                        line.append('1840-1849')
                    elif re.match("185[0-9]", date.text):
                        line.append('1850-1859')
                    elif re.match("186[0-9]", date.text):
                        line.append('1860-1869')
                    elif re.match("187[0-9]", date.text):
                        line.append('1870-1879')
                    elif re.match("188[0-9]", date.text):
                        line.append('1880-1889')
                    else:
                        line.append('[s.a]')
                    line.append(typeText)
                    line.append(genre)
                    line.append('https://desenrollandoelcordel.unige.ch/Pliegos/' + file)
                    new_csvLine.append(line)
                    # print(line)'''
        # Deduplicated list for geographical coordinates
        total_geo = {}
        for k, v in loc_list_normalized_geo:
            total_geo[k] = str(total_geo.get(k, v))
        loc_list_geo_deduplicated = [list(t) for t in total_geo.items()]
        # print(loc_list_geo_deduplicated)
        # Deduplicated list for Wikidata IDs
        total_wkd = {}
        for p, w in loc_list_normalized_wkd:
            total_wkd[p] = str(total_wkd.get(p, w))
        loc_list_wkd_deduplicated = [list(e) for e in total_wkd.items()]
        # print(loc_list_wkd_deduplicated)
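        # Sketch of the dedup trick above: dict.get(k, v) returns the value
        # already stored under k when the key exists, so the FIRST entry seen
        # for each normalised name wins. For example (hypothetical values):
        #   [["sevilla", "37.4,-6.0"], ["sevilla", "37.3,-5.9"]]
        #   -> {"sevilla": "37.4,-6.0"} -> [["sevilla", "37.4,-6.0"]]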
        # We add a <name> element inside the body of the TEI file
        # Code for prose documents - Uncomment to use it
        '''for parag in root.findall(".//tei:text//tei:p/*", ns):
            parag_text = parag.tail
            # print(parag_text)
            if parag_text is not None:
                for i in range(len(loc_list_deduplicated)):
                    # We check that the place name is in the text and encode it with <name>
                    if loc_list_deduplicated[i] in parag_text:
                        new_line = parag_text.replace(loc_list_deduplicated[i], '<name>' + loc_list_deduplicated[i] + '</name>')
                        parag.tail = new_line
                        # print(eT.dump(parag))'''
        # Code for verse documents
        for l in root.findall(".//tei:l", ns):
            if l.text is not None:
                # print(id_doc, l.text, '->', type(l))
                for i in range(len(loc_list_deduplicated)):
                    # We check that the place name is in the text and encode it with <name>
                    if loc_list_deduplicated[i] in l.text:
                        new_line = l.text.replace(loc_list_deduplicated[i], '<name>' + loc_list_deduplicated[i] + '</name>')
                        l.text = new_line
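        # Example (hypothetical line): if l.text is "camino de Sevilla" and
        # "Sevilla" is in loc_list_deduplicated, l.text becomes
        # "camino de <name>Sevilla</name>". Note that ElementTree escapes "<"
        # in text nodes on serialization, so the written file will contain
        # &lt;name&gt; unless the markup is unescaped in a later step.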
        # We add information about the edition
        fileDesc = root.find(".//tei:fileDesc", ns)
        editionStmt = eT.Element('editionStmt')
        fileDesc.insert(1, editionStmt)
        edition = eT.SubElement(editionStmt, "edition")
        edition.set("n", "2.0")
        title = eT.SubElement(edition, "title")
        title.text = "Segunda versión de la edición, enriquecida con topónimos"
        date = eT.SubElement(edition, "date")
        date.text = "2024"
        respStmt1 = eT.SubElement(editionStmt, "respStmt")
        name1 = eT.SubElement(respStmt1, "name")
        name1.text = "Elina Leblanc, Zoé Noël y Pauline Jacsont"
        resp1 = eT.SubElement(respStmt1, "resp")
        resp1.text = "Creación del modelo de reconocimiento automático de topónimos y codificación TEI"
        respStmt2 = eT.SubElement(editionStmt, "respStmt")
        name2 = eT.SubElement(respStmt2, "name")
        name2.text = "Belinda Palacios, Lorysmar Franco y Constance Carta"
        resp2 = eT.SubElement(respStmt2, "resp")
        resp2.text = "Corrección de los resultados"
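        # Resulting header fragment (abridged sketch):
        # <editionStmt>
        #   <edition n="2.0">
        #     <title>Segunda versión de la edición, enriquecida con topónimos</title>
        #     <date>2024</date>
        #   </edition>
        #   <respStmt>
        #     <name>Elina Leblanc, Zoé Noël y Pauline Jacsont</name>
        #     <resp>...</resp>
        #   </respStmt>
        #   ...
        # </editionStmt>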
        # We create a list of places in the <teiHeader>
        sourceDesc = root.find('.//tei:sourceDesc', ns)
        listPlace = eT.SubElement(sourceDesc, 'listPlace')  # We add <listPlace> in <sourceDesc>
        # We add an @xml:base attribute with the URL of Wikidata
        listPlace.set("{http://www.w3.org/XML/1998/namespace}base", "https://www.wikidata.org/wiki/")
        for place in loc_list_geo_deduplicated:
            if place[0] != "":
                placeElement = eT.SubElement(listPlace, "place")  # We add <place> in <listPlace>
                placeNb = loc_list_normalized.count(place[0])  # We count the number of times the name appears in the text
                placeElement.set("n", str(placeNb))  # @n with the number of occurrences
                for i in range(len(loc_list_wkd_deduplicated)):
                    if loc_list_wkd_deduplicated[i][0] == place[0]:
                        placeElement.set("corresp", loc_list_wkd_deduplicated[i][1])  # @corresp with the Wikidata ID
                placeName = eT.SubElement(placeElement, "placeName")  # We add the element <placeName>
                placeName.text = place[0].title()  # We add the name of the place
                placeName.set("key", place[0].replace(" ", "_"))  # @key with the name of the place
                location = eT.SubElement(placeElement, "location")  # We add a <location> element
                geo = eT.SubElement(location, "geo")  # We add a <geo> element
                geo.text = place[1].replace(",", " ")  # We add the geographical coordinates
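        # Shape of one generated entry (a sketch with hypothetical values):
        # <listPlace xml:base="https://www.wikidata.org/wiki/">
        #   <place n="3" corresp="Q8717">
        #     <placeName key="sevilla">Sevilla</placeName>
        #     <location><geo>37.38 -5.99</geo></location>
        #   </place>
        # </listPlace>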
        # Re-indent the XML tree before writing it out
        eT.indent(tree, space=" ", level=0)
        # eT.dump(root)
        # print('\n')
        # We overwrite the TEI file
        tree.write(xml_path, encoding="UTF-8", xml_declaration=True)
        print(file + ": DONE.")
# Creation of a new CSV file with the TEI information
# List with the names of the columns
new_csv_header = ['index', 'id_doc', 'original_name', 'id_wkd', 'type_place', 'coord', 'normalized_name',
                  'longitude', 'latitude', 'shortTitle', 'pubPlace', 'printer', "date", "type_text", 'genre', 'url']
# print(new_csvLine)
# Note: new_csvLine is only filled by the commented-out prose block above, so
# this file contains only the header unless that block is enabled.
with open('../Encodage/nerList_v3.csv', 'w', encoding='utf-8', newline='') as newCsv:
    writer = csv.writer(newCsv)
    writer.writerow(new_csv_header)  # First line with the column names
    writer.writerows(new_csvLine)  # We add the new lines
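# Usage (a sketch, assuming the relative folder layout hard-coded above):
#   python ner2tei.py
# Each TEI file in ../Encodage/TEI_tests/tei_ner is rewritten in place, and
# ../Encodage/nerList_v3.csv is (re)created.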