forked from the-turing-way/the-turing-way
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathno-bad-latin.py
157 lines (118 loc) · 4.5 KB
/
no-bad-latin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import re
import argparse
from pull_files import filter_files
# Directory the script is invoked from (the current working directory).
HERE = os.getcwd()
# Parent of the working directory. Relative file names are joined onto this
# path before being opened, and the default scan directory is built from it.
# NOTE(review): presumably the repository root, with the script run from a
# sub-directory -- confirm against the CI configuration.
ABSOLUTE_HERE = os.path.dirname(HERE)
# Base names of files that are exempt from the latin-phrase check.
IGNORE_LIST = ["config.yml", "style.md", "contributors-record.md", "references.bib"]
def parse_args():
    """
    Build the command line interface and parse the supplied arguments.

    The only option is ``--pull-request``, an optional string holding the
    number of the Pull Request the script is being run against.

    Returns:
        {argparse.Namespace} -- The parsed arguments; ``pull_request`` is
            None when the flag is not given
    """
    cli = argparse.ArgumentParser(
        description="Script to check for latin phrases in Markdown files"
    )
    cli.add_argument(
        "--pull-request",
        help="If the script is being run on a Pull Request, parse the PR number",
        default=None,
        type=str,
    )
    return cli.parse_args()
def remove_comments(text_string):
    """
    Strip html comments (``<!-- ... -->``) from a text string.

    A comment may span several lines. Each comment is removed together with
    its delimiters, and the match is non-greedy so text between two separate
    comments is kept.

    Arguments:
        text_string {string} -- The text to strip comments from

    Returns:
        {string} -- The input text with every html comment removed
    """
    # (?s) lets "." match newlines so multi-line comments are caught too.
    html_comment = re.compile("(?s)<!--(.*?)-->")
    return html_comment.sub("", text_string)
def get_lines(text_string, sub_string):
    """
    Collect the lines of a text that contain a given substring.

    The text is split on newline characters and every resulting line that
    contains ``sub_string`` is kept, in its original order.

    Arguments:
        text_string {string} -- The text to search through
        sub_string {string} -- The substring a line must contain to be kept

    Returns:
        {list} -- The matching lines (empty when nothing matches)
    """
    return [row for row in text_string.split("\n") if sub_string in row]
def construct_error_message(files_dict):
    """
    Build the error message listing where bad latin phrases were found.

    The message starts with a header line, followed by one entry per failing
    file giving the file, the latin phrase and the offending line.

    Arguments:
        files_dict {dictionary} -- Failing files mapped to a dictionary with
                                   the keys 'latin_type' and 'line'

    Returns:
        {string} -- The error message to be raised
    """
    header = "Bad latin found in the following files:\n"
    entries = [
        f"{path}:\t{details['latin_type']}\tfound in line\t[{details['line']}]\n"
        for path, details in files_dict.items()
    ]
    # Each part already ends in a newline, so joining on "\n" leaves a blank
    # line between entries.
    return "\n".join([header, *entries])
def read_and_check_files(files):
    """
    Function to read in files, remove html comments and check for bad latin phrases.

    Files whose base name is in IGNORE_LIST are skipped, as are files that
    cannot be found on disk. The search is case-insensitive: the text is
    lower-cased before matching, so reported lines are lower-cased as well.

    Arguments:
        files {list} -- List of filenames to be checked. Relative names are
                        resolved against ABSOLUTE_HERE.

    Returns:
        {dict} -- Dictionary: Top level keys are absolute filepaths to files
                  that failed the check. Each of these has two keys:
                  'latin_type' containing the unwanted latin phrase, and 'line'
                  containing the offending line. Only the last match found in
                  a file is kept.
    """
    failing_files = {}
    bad_latin = ["i.e.", "e.g.", "e.t.c.", " etc", " ie ", "et cetera"]

    for filename in files:
        if os.path.basename(filename) in IGNORE_LIST:
            continue

        # os.path.join returns `filename` unchanged when it is already
        # absolute, so this handles both relative and absolute inputs.
        filepath = os.path.join(ABSOLUTE_HERE, filename)

        try:
            with open(filepath, encoding="utf8", errors="ignore") as f:
                text = f.read()
        except FileNotFoundError:
            # Deliberate best effort: a listed file may no longer exist
            # (for example one deleted in a Pull Request), so skip it.
            continue

        # Lower-case once, rather than once per phrase.
        lowered_text = remove_comments(text).lower()

        # Report the path that was actually opened. Using
        # os.path.abspath(filename) would resolve relative names against the
        # working directory instead of ABSOLUTE_HERE and name the wrong file.
        report_key = os.path.abspath(filepath)

        for latin_type in bad_latin:
            # get_lines returns an empty list when the phrase is absent, so
            # no separate membership check is needed.
            for line in get_lines(lowered_text, latin_type):
                # Each match overwrites the previous entry for this file.
                failing_files[report_key] = {
                    "latin_type": latin_type,
                    "line": line,
                }

    return failing_files
def get_all_files(directory=os.path.join(ABSOLUTE_HERE, "book", "website")):
    """
    Walk a directory tree and list every file that should be checked.

    Files whose name ends in ".png" or ".jpg" are left out.

    Keyword Arguments:
        directory {string} -- Root of the directory tree to walk
                              (default: "book/website" below ABSOLUTE_HERE)

    Returns:
        {list} -- Paths of the files to check, each joined onto the
                  directory it was found in
    """
    skipped_suffixes = (".png", ".jpg")
    collected = []
    for rootdir, _, filenames in os.walk(directory):
        collected.extend(
            os.path.join(rootdir, name)
            for name in filenames
            if not name.endswith(skipped_suffixes)
        )
    return collected
def main():
    """
    Run the latin-phrase check and fail if any phrase is found.

    With ``--pull-request`` the files to check come from ``filter_files``
    (image files excluded); otherwise every file returned by
    ``get_all_files`` is checked.

    Raises:
        Exception -- If at least one file contains a bad latin phrase; the
                     message lists the failing files
    """
    args = parse_args()

    if args.pull_request is None:
        files = get_all_files()
    else:
        files = filter_files(args.pull_request, ignore_suffix=('.jpg', '.png'))

    failing_files = read_and_check_files(files)
    if not failing_files:
        return

    raise Exception(construct_error_message(failing_files))


# Only run the check when executed as a script, not when imported.
if __name__ == "__main__":
    main()