-
Notifications
You must be signed in to change notification settings - Fork 1
/
dictionary.py
124 lines (101 loc) · 4.26 KB
/
dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Compression dictionary generator for the Appareden English translation.
"""
import os
from collections import OrderedDict
from rominfo import FILE_BLOCKS, SRC_DISK, DEST_DISK, SHADOFF_COMPRESSED_EXES
from rominfo import DUMP_XLS_PATH, POINTER_XLS_PATH
from rominfo import ITEM_NAME_CATEGORIES, DICTIONARY_LOCATION
from romtools.disk import Disk, Gamefile, Block
from romtools.dump import DumpExcel, PointerExcel
# Text dump and pointer dump, loaded from the paths configured in rominfo
# (presumably Excel workbooks, going by the class and constant names).
Dump = DumpExcel(DUMP_XLS_PATH)
PtrDump = PointerExcel(POINTER_XLS_PATH)
# Source disk (given both dumps) and the destination disk.
OriginalAp = Disk(SRC_DISK, dump_excel=Dump, pointer_excel=PtrDump)
TargetAp = Disk(DEST_DISK)
# Game executables processed, in order, by the loop below.
DICTIONARY_FILES = ['ORFIELD.EXE', 'ORBTL.EXE']
# Words containing any of these byte sequences are never put in a dictionary.
# NOTE(review): b'\x82n' and b'\x81\x40' look like Shift-JIS fullwidth
# characters, and the bracketed tokens like control codes -- confirm.
_EXCLUDED_SUBSTRINGS = (
    b'\x82n', b'.', b'[00]', b'\x81\x40',
    b'[o]', b'[O]', b'[u]', b'[U]',
)

# Build and print one compression dictionary per executable.
#
# For each file: count the words of its English translations, rank them by
# frequency, pack the most frequent ones into ``dictstring`` (each entry is
# terminated by b'[ee]'), and record in ``ctrl_codes`` the two-byte code
# (0xf000 + offset into the dictionary) that stands for each word.
for filename in DICTIONARY_FILES:
    print('\n', filename, '\n\n')

    # Extract the executable from the source disk only if it is not already
    # present in the local 'original' directory.
    gamefile_path = os.path.join('original', filename)
    if not os.path.isfile(gamefile_path):
        OriginalAp.extract(filename, path_in_disk='TGL/OR', dest_path='original')
    gamefile = Gamefile(gamefile_path, disk=OriginalAp, dest_disk=TargetAp)
    block_objects = [Block(gamefile, block) for block in FILE_BLOCKS[filename]]

    # Constant for the whole file, so look it up once.
    uses_shadoff = filename in SHADOFF_COMPRESSED_EXES

    # Count every whitespace-separated word longer than two characters.
    word_counts = {}
    for block in block_objects:
        for t in Dump.get_translations(block):
            # Ignore the dictionary slot itself
            if t.location == DICTIONARY_LOCATION[filename]:
                continue
            # Item names in ORFIELD.EXE are left out of the count.
            if filename == 'ORFIELD.EXE' and t.category in ITEM_NAME_CATEGORIES:
                continue
            for w in t.english.split():
                if len(w) > 2:
                    word_counts[w] = word_counts.get(w, 0) + 1

    ctrl_codes = OrderedDict()

    # (word, count) pairs ordered by word length, then alphabetically, so the
    # ranking below is reproducible from run to run.
    words = sorted(word_counts.items(), key=lambda item: (len(item[0]), item[0]))
    # Every word seen in this file, for O(1) "is the capitalized form used?"
    known_words = {word for word, _ in words}
    #print(words)

    # Most frequent first.  Ties go to the longer word, then to reverse
    # alphabetical order.  Words seen only once are not worth an entry.
    candidates = [
        item
        for item in sorted(
            words,
            key=lambda item: (item[1], len(item[0]), item[0]),
            reverse=True,
        )
        if item[1] > 1
    ]
    print(candidates)

    # The dictionary starts with this fixed entry.  The four-character
    # b'[00]' token is counted as a single position, hence the "- 3".
    dictstring = b'Restore Pill [00]'
    cursor = len(dictstring) - 3

    for word, _count in candidates[:1000]:
        capitalized = word.capitalize()

        # With Shadoff compression entries are stored capitalized, so a word
        # whose capitalized form already has a code needs no second entry.
        if uses_shadoff and capitalized in ctrl_codes:
            continue
        if any(token in word for token in _EXCLUDED_SUBSTRINGS):
            continue
        # Skip the placeholder and anything that is empty once '~' and then
        # the characters '[', '0', ']' are stripped from both ends
        # (bytes.strip removes a set of characters, not a substring).
        if word == b'[BLANK]' or word.strip(b'~').strip(b'[00]') == b'':
            continue

        # Measure the dictionary with each bracketed token counted as one
        # character, and stop once the next entry would exceed 3500.
        packed_length = len(dictstring.replace(b'[ee]', b'0').replace(b'[00]', b'0'))
        if packed_length + len(word) + 2 > 3500:
            print("Couldn't fit %s next" % word)
            break

        # TODO: Also need to include if a capitalized version is in a substring somewhere.
        # IE battle -> Auto-Battle, for -> Forged
        if uses_shadoff:
            if capitalized in known_words:
                # The capitalized form occurs in the text: prefix the entry
                # with '^' and give the prefixed form its own code.
                dictstring += b'^'
                ctrl_codes[b'^' + capitalized] = (cursor + 0xf000).to_bytes(2, byteorder='big')
                cursor += 1
            dictstring += capitalized
            dictstring += b'[ee]'
            ctrl_codes[capitalized] = (cursor + 0xf000).to_bytes(2, byteorder='big')
            # Advance past the word plus its one-position terminator.
            cursor += len(word) + 1
        # If we're not doing Shadoff compression, just ignore caps
        else:
            ctrl_codes[word] = (cursor + 0xf000).to_bytes(2, byteorder='big')
            cursor += len(word) + 1
            print(cursor)
            dictstring += word
            dictstring += b'[ee]'

    # Emit the finished dictionary and its code table as tuple literals.
    print(dictstring)
    for c in ctrl_codes:
        print(" (%s, %s)," % (c, ctrl_codes[c]))