check.py
import os
import re
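# Consistency checks for the word-list files: line endings, allowed characters,
# sorting, uniqueness, cross-file counts, and read-only permissions.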
heb = 'א-ת'
lengths = {}
head_mid_tails = set()
head_mid_tail_len = 1
base_name = None
palin_anad_files = sorted(os.path.join('palindromes_anadromes', file) for file in os.listdir('palindromes_anadromes'))
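# Check every .txt/.csv list in the repository root plus the palindrome/anadrome lists.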
for file in sorted(os.listdir()) + palin_anad_files:
    ext = os.path.splitext(file)[1]
    if ext not in ['.txt','.csv'] or 'rejected' in file or file=='requirements.txt':
        continue
    with open(file, encoding='utf8', newline='') as f:
        text = f.read()
    sep = '\r\n' if ext=='.csv' else '\n'
    lines = text.split(sep)[:-1]
    if ext=='.csv':
        counts = [line.split(',')[1] for line in lines]
        lines = [line.split(',')[0] for line in lines]
    print(file, len(lines))
    assert lines # file is not empty
    assert text.endswith(sep), text.split(sep)[-2:] # wrong EOL or missing last empty line
    bad = re.search(r'[\d]', file.replace('cc100',''))
    assert not bad, bad # no digits which may signify temporary files
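    # Per-line checks: bigram files must hold exactly two space-separated words, and lines
    # may contain only Hebrew letters and geresh/gershayim marks, plus hyphens for
    # prefix/family/place lists and spaces for bigram/family/place lists.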
    for line in lines:
        if 'bigrams' in file:
            assert line.count(' ')==1, (line, line.count(' '))
        extra = ''
        if 'prefixes' in file or 'family' in file or 'place' in file:
            extra += '-'
        if 'bigrams' in file or 'family' in file or 'place' in file:
            extra += ' '
        bad = re.search(r'(^|\W)\'|^["\s-]|["\s-]$|["\s-]{2,}|[^\'"' + heb + extra + ']', line)
        assert not bad, (bad, line) # no non-hebrew chars
    name, ext = os.path.splitext(file)
    if name.endswith('fatverb'):
        base_name = name
        base_set = set(lines)
    elif base_name and name.startswith(base_name) and not name.startswith(base_name+'_only_'):
        assert set(lines) > base_set, (len(lines), len(base_set), len(base_set-set(lines))) # make sure prefix files also include non-prefixed words
    if ext=='.csv':
        for line in counts:
            bad = re.search(r'[^\d]', line)
            assert not bad, (bad, line) # only digits allowed in counts
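    # Fingerprint each file by its first, middle and last lines to cheaply detect
    # accidentally identical files.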
    head_mid_tail = tuple(lines[:head_mid_tail_len] + lines[len(lines)//2-head_mid_tail_len//2:len(lines)//2-head_mid_tail_len//2+head_mid_tail_len] + lines[-head_mid_tail_len:])
    assert head_mid_tail not in head_mid_tails
    head_mid_tails.add(head_mid_tail) # check that no files are identical
    empty = sum(not line.strip() for line in lines)
    assert not empty, empty # no empty lines
    lengths[file] = len(lines)
    len_set = len(set(lines))
    assert len(lines) == len_set, (len(lines), len_set) # no duplicate lines
    if ext=='.csv':
        lines_counts = list(zip(lines, counts))
        sorted_lines = sorted(lines_counts, key=lambda x: (-int(x[1]), x[0]))
        assert lines_counts == sorted_lines, (lines_counts[:5], sorted_lines[:5]) # lines are sorted
    elif 'append' not in file and 'intersect' not in file:
        sorted_lines = sorted(lines)
        assert lines == sorted_lines, (lines[:5], sorted_lines[:5]) # lines are sorted
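    # Cross-file count consistency: a with_fatverb list must have as many lines as its
    # no_fatverb and only_fatverb counterparts combined, and each with_* variant must
    # match its append_* variant in length.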
    if ext!='.csv' and 'palin' not in file and 'anad' not in file:
        if 'with_fatverb' in file and 'prefixes' not in file:
            assert lengths[file] == lengths[file.replace('with_fatverb', 'append_fatverb')], (
                lengths[file], lengths[file.replace('with_fatverb', 'append_fatverb')])
            assert lengths[file] == lengths[file.replace('with_fatverb', 'no_fatverb')] + lengths[
                file.replace('with_fatverb', 'only_fatverb')], (lengths[file],
                lengths[file.replace('with_fatverb', 'no_fatverb')] +
                lengths[file.replace('with_fatverb', 'only_fatverb')],
                lengths[file.replace('with_fatverb', 'no_fatverb')],
                lengths[file.replace('with_fatverb', 'only_fatverb')])
        if 'with_some_prefixes' in file:
            assert lengths[file] == lengths[file.replace('with_some_prefixes', 'append_some_prefixes')], (
                lengths[file], lengths[file.replace('with_some_prefixes', 'append_some_prefixes')])

assert lengths['all_with_fatverb_append_some_prefixes.txt'] == lengths[
    'all_append_fatverb_after_append_some_prefixes.txt'] == lengths[
    'all_append_fatverb_before_append_some_prefixes.txt'], (
    lengths['all_with_fatverb_append_some_prefixes.txt'], lengths['all_append_fatverb_after_append_some_prefixes.txt'],
    lengths['all_append_fatverb_before_append_some_prefixes.txt'])
# check that the data files are read-only
for file in sorted(os.listdir()):
    if os.path.splitext(file)[1] not in ['.txt','.csv']:
        continue
    try:
        with open(file, 'a', encoding='utf8'):
            pass
    except Exception:
        continue
    assert False, file+' is not readonly'