-
Notifications
You must be signed in to change notification settings - Fork 10
/
05-make-ids.py
144 lines (124 loc) · 5.78 KB
/
05-make-ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
Replaces the ids of the facts by YAGO ids
CC-BY 2022 Fabian M. Suchanek
Input:
- 02-yago-taxonomy-to-rename.tsv
- 04-yago-facts-to-rename.tsv
- 04-yago-ids.tsv
- 04-yago-bad-classes.tsv
Output:
- 05-yago-final-wikipedia.tsv
- 05-yago-final-beyond-wikipedia.tsv
- 05-yago-final-meta.tsv
- 05-yago-final-taxonomy.tsv
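Algorithm:
- load yago-ids.tsv
- drop the ids listed in yago-bad-classes.tsv
- run through yago-facts-to-rename.tsv
  - replace the Wikidata ids by YAGO ids
  - write out the facts (and their meta facts) to the output files
- run through yago-taxonomy-to-rename.tsv
  - replace the Wikidata ids by YAGO ids
  - write out the taxonomy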
"""
# Set to True to work on the bundled test data and to run the
# evaluator comparison at the end of the script
TEST = False
# Directory that holds both the input and the output TSV files of this step
if TEST:
    FOLDER = "test-data/05-make-ids/"
else:
    FOLDER = "yago-data/"
##########################################################################
# Booting
##########################################################################
import sys
import evaluator
import TsvUtils
##########################################################################
# Helper methods
##########################################################################
def isLiteral(entity):
    """ TRUE for literals and external URLs

    A literal is recognised by a leading double quote, an external URL
    by a leading '<http://' or '<https://'.
    """
    # str.startswith accepts a tuple of prefixes, so one call covers all cases
    return entity.startswith(('"', '<http://', '<https://'))
def isGeneric(entity):
    """ TRUE if the entity starts with the prefix '_:'

    Only the prefix is inspected; the '_generic_instance' suffix is not checked here.
    """
    return entity[:2] == "_:"
def toYagoEntity(entity):
    """ Translates an entity to a YAGO entity, passes through literals, returns NONE otherwise

    Returned unchanged:
    - literals (leading double quote) and external URLs ('<http://', '<https://')
    - identifiers that already start with 'yago:', 'schema:' or 'rdfs:'
    - '_:' entities that do not end with '_generic_instance'

    A generic instance '_:<class>_generic_instance' becomes
    '<mapped class>_generic_instance' (without the '_:' prefix), or None if
    the class has no entry in yagoIds or its mapped id contains no ':'.

    Any other entity is looked up in the module-level dictionary yagoIds;
    None is returned if it is not in there.
    """
    genericSuffix = "_generic_instance"
    # Literals, external URLs and already-prefixed identifiers need no renaming
    if entity.startswith(('"', '<http://', '<https://', 'yago:', 'schema:', 'rdfs:')):
        return entity
    if entity.startswith("_:"):
        # Anonymous members of lists etc.
        if not entity.endswith(genericSuffix):
            return entity
        # Generic instances: cut off the '_:' prefix and the suffix to get the class
        cls = yagoIds.get(entity[2:-len(genericSuffix)])
        if cls is None or ":" not in cls:
            return None
        return cls + genericSuffix
    # Everything else must be a known id; dict.get yields None for unknown ones
    return yagoIds.get(entity)
def goesToWikipediaVersion(entity):
    """ TRUE if the entity is a literal or external URL, is contained in
    entitiesWithWikipediaPage, or ends with '_generic_instance' """
    if isLiteral(entity):
        return True
    if entity in entitiesWithWikipediaPage:
        return True
    return entity.endswith("_generic_instance")
##########################################################################
# Main
##########################################################################
with TsvUtils.Timer("Step 05: Renaming YAGO entities"):
    # Maps the first column of 04-yago-ids.tsv to the YAGO id in its third column
    yagoIds = {}
    # YAGO ids whose row carries the marker ". #WIKI" in the fourth column
    entitiesWithWikipediaPage = set()
    for split in TsvUtils.tsvTuples(FOLDER+"04-yago-ids.tsv", " Loading YAGO ids"):
        if len(split) < 4:
            continue
        yagoIds[split[0]] = split[2]
        if split[3] == ". #WIKI":
            entitiesWithWikipediaPage.add(split[2])
    # Ids of bad classes are forgotten, so that toYagoEntity() returns None for them
    for split in TsvUtils.tsvTuples(FOLDER+"04-yago-bad-classes.tsv", " Removing bad YAGO classes"):
        yagoIds.pop(split[0], None)
    # The writers are opened in the same order as before: meta, beyond-wikipedia, wikipedia
    with TsvUtils.TsvFileWriter(FOLDER+"05-yago-final-meta.tsv") as metaFacts, \
         TsvUtils.TsvFileWriter(FOLDER+"05-yago-final-beyond-wikipedia.tsv") as fullFacts, \
         TsvUtils.TsvFileWriter(FOLDER+"05-yago-final-wikipedia.tsv") as wikipediaFacts:
        # Start value for the "subject has changed" test below
        # (presumably chosen because no renamed subject equals it -- TODO confirm)
        previousEntity = "Elvis"
        for split in TsvUtils.tsvTuples(FOLDER+"04-yago-facts-to-rename.tsv", " Renaming"):
            if len(split) < 3:
                continue
            subject = toYagoEntity(split[0])
            if not subject:
                # Should not happen
                continue
            relation = split[1]
            # Named obj rather than object, to avoid shadowing the builtin
            obj = toYagoEntity(split[2])
            if not obj:
                # Should not happen
                continue
            # A fact goes to the Wikipedia version of YAGO if its subject qualifies and
            # it is either a type fact or has a qualifying object; otherwise it goes
            # to the beyond-Wikipedia file
            if goesToWikipediaVersion(subject) and (relation == "rdf:type" or goesToWikipediaVersion(obj)):
                targetFacts = wikipediaFacts
            else:
                targetFacts = fullFacts
            targetFacts.writeFact(subject, relation, obj)
            # On a new subject, link the YAGO id back to the original id in the same file
            if subject != previousEntity and split[0] in yagoIds:
                targetFacts.writeFact(subject, "owl:sameAs", split[0])
            # If there is a meta-fact, write it out as well
            if len(split) > 5:
                if split[4]:
                    metaFacts.write("<<", subject, relation, obj, ">>", "schema:startDate", split[4])
                if split[5]:
                    metaFacts.write("<<", subject, relation, obj, ">>", "schema:endDate", split[5])
            # Generic instances do not count as a change of subject
            if not subject.endswith("_generic_instance"):
                previousEntity = subject
    with TsvUtils.TsvFileWriter(FOLDER+"05-yago-final-taxonomy.tsv") as taxFacts:
        for split in TsvUtils.tsvTuples(FOLDER+"02-yago-taxonomy-to-rename.tsv", " Renaming classes"):
            if len(split) < 3:
                continue
            subject = toYagoEntity(split[0])
            if not subject:
                # Happens if a class has no label or no instances
                continue
            relation = split[1]
            # The object of a type fact is kept as it is; all other objects are renamed
            obj = split[2] if relation == "rdf:type" else toYagoEntity(split[2])
            if not obj:
                # Happens if a class has no label or no instances
                continue
            # Write taxonomic fact
            taxFacts.writeFact(subject, relation, obj)
if TEST:
    # Hand every output file of this step to the evaluator, in the original order
    for outputFile in (
        "05-yago-final-wikipedia.tsv",
        "05-yago-final-beyond-wikipedia.tsv",
        "05-yago-final-meta.tsv",
        "05-yago-final-taxonomy.tsv",
    ):
        evaluator.compare(FOLDER+outputFile)