241 changes: 241 additions & 0 deletions tests/torchtune/models/phi3/test_phi3_tokenizer.py
@@ -22,6 +22,7 @@ def tokenizer(self):

def test_tokenize_messages(self, tokenizer):
messages = [
Message(role="system", content="You are a helpful assistant", masked=True),
Message(
role="user",
content="Below is an instruction that describes a task. Write a response "
@@ -40,6 +41,246 @@ def test_tokenize_messages(self, tokenizer):
),
]
tokens, mask = tokenizer.tokenize_messages(messages)
expected_tokens = [
1,
32006,
272,
84,
9,
615,
454,
1974,
19,
32007,
32010,
323,
418,
202,
31,
128,
15,
120,
47,
88,
584,
23,
1665,
182,
9,
434,
295,
85,
4,
780,
47,
636,
9,
1094,
213,
23,
9,
69,
69,
164,
1153,
299,
35,
961,
132,
237,
7,
5,
761,
4,
12,
0,
313,
120,
47,
88,
584,
166,
493,
171,
54,
299,
9,
906,
244,
19,
186,
767,
303,
671,
92,
209,
24,
190,
52,
38,
4,
12,
0,
1243,
7,
69,
135,
213,
166,
32007,
32001,
6,
21,
45,
128,
71,
58,
38,
14,
10,
652,
35,
462,
101,
1306,
7,
341,
171,
20,
14,
127,
26,
652,
7,
10,
1268,
4,
6,
21,
45,
591,
9,
566,
22,
994,
913,
38,
20,
52,
24,
10,
1306,
734,
14,
71,
365,
1382,
7,
10,
801,
105,
88,
244,
985,
7,
4,
6,
21,
45,
9,
566,
126,
180,
11,
5,
1137,
7,
10,
1089,
151,
8,
1156,
213,
342,
7,
10,
384,
104,
54,
470,
4,
6,
21,
45,
287,
14,
33,
125,
135,
24,
101,
512,
66,
7,
28,
822,
15,
542,
69,
59,
110,
14,
365,
229,
7,
3,
36,
267,
36,
125,
135,
24,
101,
1503,
182,
9,
222,
1661,
191,
332,
92,
92,
24,
24,
4,
32007,
]

expected_mask = [True] * 86 + [False] * 126
assert expected_tokens == tokens
assert expected_mask == mask

def test_tokenize_messages_no_system_prompt(self, tokenizer):
messages = [
Message(role="system", content="You are a helpful assistant", masked=True),
Message(
role="user",
content="Below is an instruction that describes a task. Write a response "
"that appropriately completes the request.\n\n### Instruction:\nGenerate "
"a realistic dating profile bio.\n\n### Response:\n",
masked=True,
),
Message(
role="assistant",
content="I'm an outgoing and friendly person who loves spending time with "
"friends and family. I'm also a big-time foodie and love trying out new "
"restaurants and different cuisines. I'm a big fan of the arts and enjoy "
"going to museums and galleries. I'm looking for someone who shares my "
"interest in exploring new places, as well as someone who appreciates a "
"good conversation over coffee.",
),
]
tokens, mask = tokenizer.tokenize_messages(messages, ignore_system_prompt=True)
expected_tokens = [
1,
32010,
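For readers tracing the long expected_tokens / expected_mask fixtures above: tokenize_messages returns one mask entry per token, and each entry follows the masked flag of the Message that produced the token, so the masked=True system and user turns account for the leading run of True values (86 positions in the first test) and the unmasked assistant turn for the trailing run of False values (126 positions). A minimal pytest-style sketch of that relationship follows; it assumes placement inside the same test class that defines the tokenizer fixture, and the messages below are illustrative only, not part of this PR.

from torchtune.data import Message  # Message is already imported in this test module

def test_mask_follows_message_masked_flag(self, tokenizer):
    # Sketch only: assumes this method lives in the same test class as the
    # `tokenizer` fixture used by the tests above.
    messages = [
        Message(role="system", content="You are a helpful assistant", masked=True),
        Message(role="user", content="Tell me a joke.", masked=True),
        Message(role="assistant", content="Why did the chicken cross the road?"),
    ]
    tokens, mask = tokenizer.tokenize_messages(messages)

    # One mask entry per token.
    assert len(tokens) == len(mask)

    # Tokens from the masked system/user turns come first and are flagged True;
    # tokens from the unmasked assistant turn follow and are flagged False.
    assert mask[0]
    assert not mask[-1]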
6 changes: 3 additions & 3 deletions torchtune/models/phi3/_tokenizer.py
@@ -125,7 +125,7 @@ def tokenize_messages(
messages: List[Message],
*,
add_eos: bool = False,
- ignore_system_prompts: bool = True,
+ ignore_system_prompt: bool = False,
) -> Tuple[List[int], List[bool]]:
r"""Tokenize a list of messages one at a time then concatenate them,
returning a list of tokens and a list of masks.
@@ -151,7 +151,7 @@
messages (List[Message]): A list of messages, each containing role, content,
and masked attributes.
add_eos (bool): Whether to append EOS after assistant message, default to False
- ignore_system_prompts (bool): Whether to ignore system prompts. This matches the HF implementation, default to True.
+ ignore_system_prompt (bool): Whether to ignore system prompt, defaults to False.

Raises:
ValueError: If the role is not "user", "assistant", or "system".
@@ -175,7 +175,7 @@

for message in templated_messages:
# Skip system prompt
- if ignore_system_prompts and message.role == "system":
+ if ignore_system_prompt and message.role == "system":
continue

# Prepend BOS on start of new turns
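Net effect of this file's changes: the flag is renamed from ignore_system_prompts to ignore_system_prompt and its default flips from True to False, so system messages are now tokenized unless the caller explicitly opts out. Below is a hedged usage sketch of the new default versus the opt-out; the phi3_mini_tokenizer builder and the tokenizer model path are assumptions for illustration and not part of this diff.

from torchtune.data import Message
from torchtune.models.phi3 import phi3_mini_tokenizer

# Assumed setup: the SentencePiece model path below is a placeholder.
tokenizer = phi3_mini_tokenizer("/path/to/tokenizer.model")

messages = [
    Message(role="system", content="You are a helpful assistant", masked=True),
    Message(role="user", content="Generate a dating profile bio.", masked=True),
    Message(role="assistant", content="I'm an outgoing and friendly person."),
]

# New default (ignore_system_prompt=False): the system turn is included.
with_system, _ = tokenizer.tokenize_messages(messages)

# Explicit opt-out: system messages are skipped during tokenization.
without_system, _ = tokenizer.tokenize_messages(messages, ignore_system_prompt=True)

assert len(without_system) < len(with_system)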