# Helper file to dynamically generate unit tests
# It works by running the Python `transformers` library and comparing its outputs with ours.

import json
import os

from transformers import AutoTokenizer, AutoConfig

from scripts.supported_models import SUPPORTED_MODELS
# List of tokenizers where the model isn't yet supported, but the tokenizer is
ADDITIONAL_TOKENIZERS_TO_TEST = {
    'RefinedWebModel': [
        'tiiuae/falcon-7b',
    ],
    'llama': [
        'hf-internal-testing/llama-tokenizer',
    ],
    'mpt': [
        'mosaicml/mpt-7b',
    ],
}
TOKENIZER_TEST_DATA = {
    "shared": [
        "hello world",
        "Hello World",
        "How are you doing?",
        "You should've done this",
        "A\n'll !!to?'d''d of, can't.",
        "def main():\n\tpass",
        "This\n\nis\na\ntest.",
        "let a = obj.toString();\ntoString();",
        'Hi Hello',
        "trailing space ",
        " leading space",
        "生活的真谛是",
        "The company was founded in 2016.",
        "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
        "I bought an apple for $1.00 at the store.",
        "you… ",
        "\u0079\u006F\u0075\u2026\u00A0\u00A0",
        "\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
    ],
    "custom": {
        "tiiuae/falcon-7b": [
            "12 and 123 and 1234",  # Special case for splitting on 3 numbers
        ],
    },
}
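
# For a given tokenizer, the test inputs are all 18 "shared" strings above plus
# any "custom" strings registered under its name; e.g. 'tiiuae/falcon-7b' is
# exercised on 19 inputs in total.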


def generate_tokenizer_tests():
    results = {}

    tokenizers_to_test = list(SUPPORTED_MODELS.items()) + \
        list(ADDITIONAL_TOKENIZERS_TO_TEST.items())

    for model_type, tokenizer_names in tokenizers_to_test:
        print(f'Generating tests for {model_type}')
        for tokenizer_name in tokenizer_names:
            print(' -', tokenizer_name)

            try:
                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
            except KeyError:
                # If a KeyError is raised from the AutoTokenizer, it means the model
                # does not use a tokenizer (e.g., vision models)
                continue

            tokenizer_results = []

            shared_texts = TOKENIZER_TEST_DATA["shared"]
            custom_texts = TOKENIZER_TEST_DATA["custom"].get(tokenizer_name, [])

            # Run tokenizer on test cases
            for text in shared_texts + custom_texts:
                # TODO: add with_pair option
                encoded = tokenizer(text).data
                decoded_with_special = tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=False)
                decoded_without_special = tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=True)

                tokenizer_results.append(dict(
                    input=text,
                    encoded=encoded,
                    decoded_with_special=decoded_with_special,
                    decoded_without_special=decoded_without_special,
                ))

            results[tokenizer_name] = tokenizer_results

    return results
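
# For reference, a sketch of the structure generate_tokenizer_tests() returns.
# The values shown are illustrative only; the actual token ids depend on each
# tokenizer's vocabulary:
#
# {
#     "hf-internal-testing/llama-tokenizer": [
#         {
#             "input": "hello world",
#             "encoded": {"input_ids": [...], "attention_mask": [...]},
#             "decoded_with_special": "<s> hello world",
#             "decoded_without_special": "hello world",
#         },
#         ...
#     ],
#     ...
# }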


def generate_config_tests():
    results = {}
    for model_type, config_names in SUPPORTED_MODELS.items():
        for config_name in config_names:
            # Load config
            config = AutoConfig.from_pretrained(config_name)
            results[config_name] = config.to_dict()

            # TODO: Remove after https://github.com/huggingface/transformers/issues/23876 is fixed
            results[config_name].pop('torch_dtype', None)
    return results
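
# Likewise, a sketch of what generate_config_tests() returns (abridged and
# illustrative, assuming e.g. 'bert-base-uncased' appears in SUPPORTED_MODELS;
# each real entry contains every key from config.to_dict() except the removed
# 'torch_dtype'):
#
# {
#     "bert-base-uncased": {
#         "model_type": "bert",
#         "hidden_size": 768,
#         ...
#     },
#     ...
# }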


def main():
    # TODO: add option to cache generated data + force build tests

    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "data",
    )

    tokenizer_tests = generate_tokenizer_tests()
    with open(os.path.join(data_dir, "tokenizer_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(tokenizer_tests, fp)

    config_tests = generate_config_tests()
    with open(os.path.join(data_dir, "config_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(config_tests, fp)


if __name__ == "__main__":
    main()
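
# Usage sketch (assuming this file lives at scripts/generate_tests.py, as its
# import of scripts.supported_models suggests, and is run from the repository
# root so that import resolves):
#
#     python -m scripts.generate_tests
#
# This writes tokenizer_tests.json and config_tests.json into the "data"
# directory next to this file; that directory is assumed to already exist.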