-
Notifications
You must be signed in to change notification settings - Fork 18
/
lib.py
122 lines (104 loc) · 3.88 KB
/
lib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import re
import unicodedata
# Roman numeral building blocks as (value, numeral) pairs, listed in strictly
# descending value order and including the subtractive pairs (CM, CD, XC, ...).
roman = [
    (1000, "M"),
    (900, "CM"),
    (500, "D"),
    (400, "CD"),
    (100, "C"),
    (90, "XC"),
    (50, "L"),
    (40, "XL"),
    (10, "X"),
    (9, "IX"),
    (5, "V"),
    (4, "IV"),
    (1, "I"),
]
def generatelabelsrule(labels):
    """Collapse a sequence of per-page labels into numbering rules.

    Every label is converted with ``str()`` and stripped, then parsed by
    ``estimatelabel``. A rule is emitted for a page whenever its label differs
    from the label predicted for it, where the prediction is the previous
    page's parsed label with ``firstpagenum`` advanced by one and rendered
    through ``createlabel``.

    Args:
        labels: Iterable of page labels in page order.

    Returns:
        A list of dicts holding the keys returned by ``estimatelabel`` plus
        ``"startpage"``, the 0-based position of the page that opens the rule.
    """
    cleaned = [str(item).strip() for item in labels]
    rules = []
    # ``False`` never equals a string, so the first page always opens a rule.
    expected = False
    for page, text in enumerate(cleaned):
        parsed = estimatelabel(text)
        if text != expected:
            rules.append({**parsed, "startpage": page})
        # Predict what the next page would be called if numbering continues.
        successor = dict(parsed, firstpagenum=parsed["firstpagenum"] + 1)
        expected = createlabel(successor)
    return rules
def estimatelabel(label):
    """Parse a single page label into a numbering rule.

    The returned dict has three keys:

    * ``"style"``: ``"D"`` for decimal, ``"r"``/``"R"`` for lower/upper case
      roman numerals, or ``""`` when the label is treated as literal text.
    * ``"prefix"``: text placed in front of the number (the whole label for
      the literal style).
    * ``"firstpagenum"``: the numeric value of the label (``0`` for the
      literal style).

    Zero padding in a prefixed decimal label (``"p01"``) cannot be expressed
    by an integer, so the padding zeros are kept as part of the prefix
    (``prefix="p0"``, ``firstpagenum=1``). That way rendering the rule
    reproduces the original label instead of silently turning it into
    ``"p1"``.

    Args:
        label: The page label text.

    Returns:
        The rule dict described above.
    """
    # Plain decimal number without leading zeros; "0" is handled as literal text.
    if re.fullmatch("[0-9]+", label) and label != "0" and str(int(label)) == label:
        return {"style": "D", "prefix": "", "firstpagenum": int(label)}

    # Non-digit prefix followed by a decimal number.
    if m := re.fullmatch("([^0-9]+?)([0-9]+)", label):
        prefix, digits = m.groups()
        # Keep one digit even when the number is all zeros ("p00" -> "p0" + 0).
        significant = digits.lstrip("0") or "0"
        padding = digits[: len(digits) - len(significant)]
        return {"style": "D", "prefix": prefix + padding, "firstpagenum": int(significant)}

    # Pure roman numeral, all upper case or all lower case. The pattern also
    # matches the empty string, hence the explicit non-empty check.
    if label and (m := re.fullmatch("(?<=^)(M{0,}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|m{0,}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))(?=$)", label)):
        return {"style": "R" if label.isupper() else "r", "prefix": "", "firstpagenum": destroyroman(m.group(0))}

    # The three modes below are disabled because they are buggy and very rare.
    # They are kept for reference only.
    #
    # elif m := re.fullmatch("(?<=^)(.+?)(?!$)(M{0,}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|m{0,}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3}))(?=$)", label):
    #     return {"style": "R" if m.group(2).isupper() else "r", "prefix": m.group(1), "firstpagenum": destroyroman(m.group(2))}
    # # These two are completely broken, NEVER enable them
    # elif m := re.fullmatch("[a-z]+|[A-Z]+", label):
    #     return {"style": "A" if label.isupper() else "a", "prefix": "", "firstpagenum": destroyalphabetical_fake(label)}
    # elif m := re.fullmatch("(?<=^)(.+?)(?!$)([a-z]+|[A-Z]+)(?=$)", label):
    #     return {"style": "A" if m.group(2).isupper() else "a", "prefix": m.group(1), "firstpagenum": destroyalphabetical_fake(m.group(2))}

    # Anything else is reproduced verbatim through the prefix.
    return {"style": "", "prefix": label, "firstpagenum": 0}
def createlabel(rule):
    """Render the label text described by a numbering rule.

    Args:
        rule: Mapping with the keys ``"style"``, ``"prefix"`` and
            ``"firstpagenum"``.

    Returns:
        The prefix followed by the number rendered in the rule's style:
        decimal for ``"D"``, roman for ``"r"``/``"R"``, alphabetical for
        ``"a"``/``"A"`` (upper case for the capital style letters). For any
        other style only the prefix is returned.
    """
    style = rule["style"]
    if style == "D":
        return rule["prefix"] + str(rule["firstpagenum"])
    if style in ("r", "R"):
        # buildroman yields upper case; fold it down for the "r" style.
        numeral = buildroman(rule["firstpagenum"])
        if style == "r":
            numeral = numeral.lower()
        return rule["prefix"] + numeral
    if style in ("a", "A"):
        # buildalphabetical output is used as-is for "a" and upper-cased for "A".
        letters = buildalphabetical(rule["firstpagenum"])
        if style == "A":
            letters = letters.upper()
        return rule["prefix"] + letters
    return rule["prefix"]
def buildroman(n):
    """Convert an integer to a roman numeral using the ``roman`` table.

    Args:
        n: The number to convert.

    Returns:
        The numeral built from the symbols in ``roman``; an empty string when
        ``n`` is not greater than zero.
    """
    remaining = n
    pieces = []
    while remaining > 0:
        # Walk the table from the largest value down, taking each symbol as
        # many times as it fits into what is left.
        for value, numeral in roman:
            count, remaining = divmod(remaining, value)
            pieces.append(numeral * count)
    return "".join(pieces)
def buildalphabetical(n):
    """Render a 1-based page number as a lowercase repeated-letter label.

    The sequence is ``a`` .. ``z`` for 1-26, ``aa`` .. ``zz`` for 27-52,
    ``aaa`` .. ``zzz`` for 53-78 and so on. This is the inverse of
    ``destroyalphabetical``.

    The previous implementation indexed the alphabet with ``r + 1`` after
    ``divmod(n, 26)``, which shifted every label (1 gave ``"c"``, 26 gave
    ``"bb"``) and raised ``IndexError`` whenever ``n % 26 == 25``.

    Args:
        n: Page number, starting at 1.

    Returns:
        The label, or an empty string when ``n`` is smaller than 1.
    """
    if n < 1:
        return ""
    # Shift to 0-based so that 26 still maps onto "z" instead of rolling over.
    extra_repeats, index = divmod(n - 1, 26)
    return chr(ord("a") + index) * (extra_repeats + 1)
def destroyalphabetical(s):
    """Convert a repeated-letter page label back to its 1-based number.

    ``a`` .. ``z`` map to 1-26, ``aa`` .. ``zz`` to 27-52 and so on. The
    input is case-insensitive, so ``"A"`` and ``"a"`` both give 1 (upper case
    input previously produced negative numbers).

    Args:
        s: A label made of one ASCII letter repeated one or more times.

    Returns:
        The page number represented by the label.

    Raises:
        ValueError: If ``s`` is empty, does not start with an ASCII letter,
            or is not a single letter repeated.
    """
    t = s.lower()
    if not t or not "a" <= t[0] <= "z" or t != t[0] * len(t):
        raise ValueError(f"The string {s} is not a valid alphabetical page!")
    # Each extra repetition of the letter stands for one full pass of 26.
    return 26 * (len(t) - 1) + (ord(t[0]) - ord("a") + 1)
def buildalphabetical_fake(n):
    """Render a 0-based number as upper case letters, spreadsheet-column style.

    0 gives ``A``, 25 gives ``Z``, 26 gives ``AA``, 701 gives ``ZZ``, 702
    gives ``AAA`` and so on.

    Args:
        n: The number to convert.

    Returns:
        The letter string for ``n``.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Find the label width by skipping all shorter labels (26, 26**2, ...).
    width = 1
    rest = n
    while rest >= 26 ** width:
        rest -= 26 ** width
        width += 1
    # What is left is an ordinary base-26 number with exactly ``width`` digits.
    out = []
    for power in range(width - 1, -1, -1):
        digit, rest = divmod(rest, 26 ** power)
        out.append(letters[digit])
    return "".join(out)
def destroyroman(s):
    """Convert a roman numeral to its integer value.

    The input is upper-cased before validation, so any letter case is
    accepted. An empty string passes validation and yields 0.

    Args:
        s: The roman numeral text.

    Returns:
        The integer value of the numeral.

    Raises:
        ValueError: If the upper-cased text is not a well-formed roman numeral.
    """
    remaining = s.upper()
    well_formed = re.fullmatch("M{0,}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})", remaining)
    if well_formed is None:
        raise ValueError(f"The string {s} is not a valid roman numeral!")
    total = 0
    while remaining:
        # Each pass strips at most one occurrence of every symbol, largest
        # first; repeated symbols are consumed by later passes.
        for value, numeral in roman:
            if remaining.startswith(numeral):
                remaining = remaining[len(numeral):]
                total += value
    return total
def destroyalphabetical_fake(s):
    """Convert a spreadsheet-column style letter string to a 0-based number.

    ``a`` gives 0, ``z`` gives 25, ``aa`` gives 26 and so on; the input is
    lower-cased first, so letter case does not matter.

    Args:
        s: The letter string.

    Returns:
        The 0-based number represented by the letters.

    Raises:
        ValueError: If the lower-cased text contains anything other than the
            letters ``a`` to ``z``, or is empty.
    """
    lowered = s.lower()
    if re.fullmatch("[a-z]+", lowered) is None:
        raise ValueError(f"The string {s} is not a valid alphabetical page!")
    total = 0
    for ch in lowered:
        # Letters count from 1 ("a") to 26 ("z"); accumulate left to right.
        total = total * 26 + (ord(ch) - 96)
    return total - 1
def cleantoc(toc):
    """Drop table-of-contents entries that merely repeat their predecessor.

    An entry is removed when the entry directly before it in ``toc`` has a
    first field that is exactly one less, and identical second and third
    fields. The comparison always uses the original neighbours, not the
    filtered result.

    NOTE(review): the fields look like (level, title, page) - confirm against
    the caller.

    Args:
        toc: Sequence of entries, each indexable at positions 0, 1 and 2.

    Returns:
        A new list with the redundant entries removed; an empty list when
        ``toc`` is empty (this previously raised ``IndexError``).
    """
    if not toc:
        return []
    cleaned = [toc[0]]
    for previous, current in zip(toc, toc[1:]):
        repeats_previous = (
            previous[0] == current[0] - 1
            and previous[1] == current[1]
            and previous[2] == current[2]
        )
        if not repeats_previous:
            cleaned.append(current)
    return cleaned
def sanitizetitle(title):
    """Turn a title into a lowercase ASCII slug.

    The title is NFKD-normalised and stripped of everything that cannot be
    encoded as ASCII, lower-cased, cleared of characters other than word
    characters, whitespace and hyphens, and finally every run of whitespace
    and hyphens is collapsed into a single hyphen.

    Args:
        title: The text to convert.

    Returns:
        The slug with leading and trailing hyphens and underscores removed.
    """
    decomposed = unicodedata.normalize('NFKD', title)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    kept = re.sub(r'[^\w\s-]', '', ascii_text.lower())
    slug = re.sub(r'[-\s]+', '-', kept)
    return slug.strip('-_')