-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconsistency.py
204 lines (164 loc) · 7.58 KB
/
consistency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
Checks the consistency of BibTeX entries.
"""
import argparse
from aux import nanny
__author__ = 'Marc Schulder'
# Template for the section headlines of the printed report; {} is replaced by the section title.
HEADLINE_PATTERN = "===== {} ====="
# Template for the notice printed when an enabled check has no implementation yet;
# {} is replaced by the name of the check.
NOT_IMPLEMENTED_PATTERN = "# Warning for {} not yet implemented.\n"
class ConsistencyConfig(nanny.NannyConfig):
    """Configuration of the consistency checker.

    Overrides the value readers of ``nanny.NannyConfig`` so that boolean
    options missing from the config default to ``FALLBACK_VALUE`` and list
    options are returned as cleaned-up lists of strings.
    """
    # Section name for this tool (presumably looked up by nanny.NannyConfig -- verify there).
    SECTION = 'Consistency'
    # Value used for boolean options that the config does not define.
    FALLBACK_VALUE = True

    def _getConfigValue(self, section, key, fallback=None):
        """Return the boolean stored under ``key`` in ``section``.

        :param section: config section offering ``getboolean(key, fallback=...)``
            (presumably a ``configparser`` section proxy).
        :param key: name of the option to read.
        :param fallback: value handed to ``getboolean`` for a missing option.
        :return: the parsed boolean, or ``FALLBACK_VALUE`` (after printing a
            warning) when the lookup yields ``None``.
        """
        value = section.getboolean(key, fallback=fallback)
        if value is None:
            value = self.FALLBACK_VALUE
            print('WARNING: Config contains no information for key "{}", value defaults to "{}"'.format(
                key, self.FALLBACK_VALUE))
        return value

    def _getConfigList(self, section, key, separator=','):
        """Return the option ``key`` of ``section`` split into a list of strings.

        :param section: config section offering ``get(key, fallback=...)``.
        :param key: name of the option to read.
        :param separator: string that separates the items in the raw value.
        :return: the whitespace-stripped, non-empty items; an empty list when
            the option is missing or contains no items.
        """
        value = section.get(key, fallback=None)
        if value is None:
            return []
        # Drop empty items so that a blank value ("key =") or a trailing/doubled
        # separator does not smuggle '' entries into the result.
        stripped = (item.strip() for item in value.split(separator))
        return [item for item in stripped if item]
def getEnumerationString(entries, quotes=None):
    """Join the keys of ``entries`` into an enumeration such as ``a, b and c``.

    :param entries: sequence of objects that expose their BibTeX key as ``.key``.
    :param quotes: optional string placed directly before and after every key
        (e.g. ``'"'``); ``None`` leaves the keys bare.
    :return: ``''`` for no entries, the single key for one entry, otherwise all
        keys separated by ``', '`` with ``' and '`` before the last one.
    """
    wrapper = '' if quotes is None else quotes
    # Quote every key the same way; previously the first key of a
    # multi-entry enumeration was left unquoted.
    keys = ['{1}{0}{1}'.format(entry.key, wrapper) for entry in entries]
    if not keys:
        return ''
    if len(keys) == 1:
        return keys[0]
    return '{} and {}'.format(', '.join(keys[:-1]), keys[-1])
def checkConsistency(entries, config):
    """Run every enabled consistency check on ``entries`` and print the findings.

    Each check is switched on by a boolean attribute of ``config``. Checks
    that have no implementation yet only print a "not yet implemented" notice.

    :param entries: BibTeX entries as produced by ``nanny.loadBibTex`` (and
        optionally ``nanny.filterEntries``); must support lookup by entry key.
    :param config: object exposing one boolean attribute per check plus
        ``duplicateTitlesIgnoredTypes``.
    :return: None; all findings are written to standard output.
    """
    # Check for Duplicates #
    # Duplicate keys
    if config.duplicateKeys:
        print(NOT_IMPLEMENTED_PATTERN.format("Duplicate Keys"))
        # duplicateKeys = nanny.findDuplicateKeys(entries)
        # if duplicateKeys:
        #     print(HEADLINE_PATTERN.format("Duplicate Keys"))
        #     for duplicateKey in duplicateKeys:
        #         print("Found duplicate key: {}".format(duplicateKey))
        #     print()

    # Duplicate titles
    # Todo: Add handling of acceptable cases, such as different editions of a book, preprints and talks.
    if config.duplicateTitles:
        title2duplicateEntries = nanny.findDuplicateTitles(entries, config.duplicateTitlesIgnoredTypes)
        if title2duplicateEntries:
            print(HEADLINE_PATTERN.format("Duplicate Titles"))
            for duplicateTitleEntries in title2duplicateEntries.values():
                keysString = getEnumerationString(duplicateTitleEntries)
                # All entries of a group share the title, so the first one is shown.
                firstTitle = duplicateTitleEntries[0][nanny.FIELD_TITLE]
                print("Entries {} have the same title: {}".format(keysString, firstTitle))
            print()

    # Missing fields #
    if config.anyMissingFields:
        key2availability = nanny.getFieldAvailabilities(entries)
        if key2availability:
            print(HEADLINE_PATTERN.format("Missing fields"))
            for key, availability in key2availability.items():
                missingRequiredFields = availability[nanny.FIELD_IS_REQUIRED_MISSING]
                missingOptionalFields = availability[nanny.FIELD_IS_OPTIONAL_MISSING]
                reportRequired = config.missingRequiredFields and missingRequiredFields
                reportOptional = config.missingOptionalFields and missingOptionalFields
                # Only print the entry header when a detail line follows it;
                # otherwise the report would contain a bare "Entry ..." line.
                if not (reportRequired or reportOptional):
                    continue
                print("Entry {}".format(key))
                if reportRequired:
                    print(" Required missing: ", ', '.join(missingRequiredFields))
                if reportOptional:
                    print(" Optional missing: ", ', '.join(missingOptionalFields))
            print()

    # Bad Formatting #
    # Unsecured uppercase characters in titles
    # Todo: Identify over-eager use of curly braces, e.g. across multiple words
    # Todo: Add option to prefer braces around full words instead of single characters
    # Todo: Improve search of unsecured characters to not break when double braces are used
    if config.unsecuredTitleChars:
        key2unsecuredChars = nanny.findUnsecuredUppercase(entries, field="title")
        if key2unsecuredChars:
            print(HEADLINE_PATTERN.format("Titles with uppercase characters that are not secured by curly braces"))
            for key in key2unsecuredChars:
                title = entries[key][nanny.FIELD_TITLE]
                print("Entry {} has unsecured uppercase characters: {}".format(key, title))
            print()

    # Unnecessary curly braces
    if config.unnecessaryBraces:
        print(NOT_IMPLEMENTED_PATTERN.format("unnecessary curly braces"))

    # Bad page numbers
    if config.badPageNumbers:
        badPageNumberEntries = nanny.findBadPageNumbers(entries, tolerateSingleHyphens=False)
        if badPageNumberEntries:
            print(HEADLINE_PATTERN.format("Entries with badly formatted page numbers"))
            for entry in badPageNumberEntries:
                print("Entry {} has bad page number format: {}".format(entry.key, entry[nanny.FIELD_PAGES]))
            print()

    # Inconsistent Formatting #
    # Inconsistent names for conferences
    if config.inconsistentConferences:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent names for conferences"))

    # Incomplete name formatting (e.g. first name is initials only or missing middle names found in other entry)
    if config.incompleteNames:
        print(NOT_IMPLEMENTED_PATTERN.format("incomplete name formatting"))

    # Ambiguous name formatting (i.e. not following the "LAST, FIRST and LAST, FIRST" format)
    if config.ambiguousNames:
        print(NOT_IMPLEMENTED_PATTERN.format("ambiguous name formatting"))

    # All-caps name formatting
    if config.allcapsNames:
        # One report section per person-name field.
        for field in nanny.PERSON_NAME_FIELDS:
            entrykey2CapsNames = nanny.findAllCapsName(entries, field)
            if entrykey2CapsNames:
                print(HEADLINE_PATTERN.format("{}s whose names are all-caps".format(field.capitalize())))
                for key, capsnames in entrykey2CapsNames.items():
                    for capsname in capsnames:
                        print("Entry {} has {}s which are all-caps: {}".format(key, field, capsname.pretty()))
                print()

    # Inconsistent location names
    if config.inconsistentLocations:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent location names"))

    # Inconsistent inferrable information
    if config.inconsistentInferrableInfo:
        print(NOT_IMPLEMENTED_PATTERN.format("inconsistent inferrable information"))

    # if nanny.warnings:
    #     print("===== Encountered Warnings =====")
    #     for warning in nanny.warnings:
    #         print(' {}'.format(warning))
    #     print()
def main():
    """Command-line entry point: parse the arguments, load the data and run the checks.

    Expects the path of a BibTeX file and optionally an auxiliary file
    (``-a``/``--aux``), whose cited keys restrict the entries that are checked,
    and a config file (``-c``/``--config``).
    """
    argParser = argparse.ArgumentParser(description='Check the consistency of BibTeX entries.')
    for flags in (('bibtexfile',), ('-a', '--aux'), ('-c', '--config')):
        argParser.add_argument(*flags)
    # TODO: Allow multiple bibtex files
    options = argParser.parse_args()

    # BibTeX entries, narrowed down to the cited keys when an auxiliary file is given
    bibEntries = nanny.loadBibTex(options.bibtexfile)
    if options.aux:
        citedKeys = nanny.loadCitedKeys(options.aux)
        bibEntries = nanny.filterEntries(bibEntries, citedKeys)

    # Settings that select which checks are run
    settings = ConsistencyConfig(options.config)

    checkConsistency(bibEntries, settings)
# Run the checker only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()