Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update-llama-code #25826

Merged
merged 19 commits into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions docs/source/en/model_doc/code_llama.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Here is a sample usage
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
come in several checkpoints, they each contain a part of each weight of the model, so we need to load them all in RAM).
muellerzr marked this conversation as resolved.
Show resolved Hide resolved

- After conversion, the model and tokenizer can be loaded via:

Expand Down Expand Up @@ -90,8 +92,8 @@ If you only want the infilled part:
>>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
>>> generator('def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result', max_new_tokens = 128, return_type = 1)
```
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.

Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug.

- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.

Expand Down
60 changes: 37 additions & 23 deletions src/transformers/models/code_llama/tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
there is no padding token in the original model.

Args:
The default configuration matches that of the model []() which supports prompt infilling.
ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
vocab_file (`str`):
Path to the vocabulary file.
eos_token (`str`, *optional*, defaults to `"</s>"`):
Expand All @@ -80,8 +81,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
Prefix token used for infilling.
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
Expand Down Expand Up @@ -111,7 +110,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.

use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""

vocab_files_names = VOCAB_FILES_NAMES
Expand All @@ -125,7 +125,6 @@ def __init__(
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token=None,
prefix_token="▁<PRE>",
middle_token="▁<MID>",
suffix_token="▁<SUF>",
Expand All @@ -136,23 +135,26 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
additional_special_tokens=None,
use_default_system_prompt=False,
ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
**kwargs,
):
requires_backends(self, "protobuf")
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

self.use_default_system_prompt = use_default_system_prompt
# mark tokens special to skip them
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
ArthurZucker marked this conversation as resolved.
Show resolved Hide resolved
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
prefix_token=prefix_token,
Expand All @@ -164,6 +166,7 @@ def __init__(
suffix_first=suffix_first,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
additional_special_tokens=additional_special_tokens,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self.vocab_file = vocab_file
Expand Down Expand Up @@ -239,6 +242,7 @@ def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
Expand All @@ -247,7 +251,7 @@ def get_vocab(self):

def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
# add a prefix space to `prefix`
if self.fill_token in prefix and suffix is None:
if self.fill_token is not None and self.fill_token in prefix and suffix is None:
prefix, suffix = prefix.split(self.fill_token)

if len(prefix) > 0:
Expand All @@ -263,9 +267,9 @@ def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[in

if None in (self.prefix_id, self.middle_id, self.suffix_id):
raise ValueError(
"Then input includes a `prefix` and a `suffix` used for the infilling task,"
" the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
"The input either includes a `prefix` and a `suffix` used for the infilling task,"
f" or can be split on the {self.fill_token} token, creating a suffix and prefix,"
" but the model does not support `infilling`."
)
suffix_tokens = self._tokenize(suffix) # make sure CodeLlama sp model does not mess up

Expand Down Expand Up @@ -293,10 +297,12 @@ def _tokenize(self, text, **kwargs):
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
Expand All @@ -320,6 +326,7 @@ def convert_tokens_to_string(self, tokens):
out_string += self.sp_model.decode(current_sub_tokens)
return out_string

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Expand Down Expand Up @@ -347,6 +354,7 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None)

return (out_vocab_file,)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
Expand All @@ -358,6 +366,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):

return output

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
Expand Down Expand Up @@ -395,6 +404,7 @@ def get_special_tokens_mask(
+ eos_token_id
)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
Expand Down Expand Up @@ -443,7 +453,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
>>> from transformers import Conversation

>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
... ) # doctest: +IGNORE_RESULT
```
Args:
Expand All @@ -453,16 +463,20 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
`List[int]`:
Input ids for the conversation.
"""
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
if self.use_default_system_prompt:
if len(conversation.past_user_inputs) > 0:
if (
not conversation.past_user_inputs[0].startswith(B_SYS)
or E_SYS not in conversation.past_user_inputs[0]
):
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we not want to check that the conversation ids start with B_SYS and contain E_SYS even if we're not using the default prompt?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this was just to add the system prompt if there is no system prompt. Now we just let the user define the system prompt!


dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
The token used to split the input between the prefix and suffix.
suffix_first (`bool`, *optional*, defaults to `False`):
Whether the input prompt and suffix should be formatted with the suffix first.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""

vocab_files_names = VOCAB_FILES_NAMES
Expand All @@ -124,13 +128,18 @@ def __init__(
suffix_token="▁<SUF>",
eot_token="▁<EOT>",
fill_token="<FILL_ME>",
additional_special_tokens=None,
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The additional args should be documented in the doc string

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed thanks

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(not for this PR) The add_bos_token and add_eos_token are not documented, and the args are in a very different order than the docstring

**kwargs,
):
# mark tokens special to skip them
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []
self.use_default_system_prompt = use_default_system_prompt

super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
Expand All @@ -144,6 +153,7 @@ def __init__(
suffix_token=suffix_token,
eot_token=eot_token,
fill_token=fill_token,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self._add_bos_token = add_bos_token
Expand All @@ -162,6 +172,7 @@ def __init__(
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False

# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
Expand Down Expand Up @@ -300,6 +311,7 @@ def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_toke
self.set_infilling_processor(True)
return tokens

# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
Expand Down Expand Up @@ -343,12 +355,12 @@ def build_inputs_with_special_tokens(
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
return self.bos_token_id + token_ids_0 + self.eos_token_id
return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id

def _build_conversation_input_ids(self, conversation: "Conversation"):
# Copied from transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer._build_conversation_input_ids
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
r"""Builds the input ids for a conversation.
This is the format used in the provided examples. System prompts should be manually added at the beginning of
the conversation. If no system prompt is given and `use_default_system_prompt` is set, the `DEFAULT_SYSTEM_PROMPT` will be used.
Expand All @@ -363,7 +375,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
>>> from transformers import Conversation

>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
... ) # doctest: +IGNORE_RESULT
```
Args:
Expand All @@ -373,16 +385,20 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
`List[int]`:
Input ids for the conversation.
"""
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
if self.use_default_system_prompt:
if len(conversation.past_user_inputs) > 0:
if (
not conversation.past_user_inputs[0].startswith(B_SYS)
or E_SYS not in conversation.past_user_inputs[0]
):
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")

dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
Expand All @@ -392,7 +408,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
)

dialog_tokens = []
dialog_tokens: List[int] = []
dialog_tokens += sum(
[
[self.bos_token_id]
Expand Down
9 changes: 7 additions & 2 deletions tests/models/code_llama/test_tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ def setUp(self):
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)

def test_no_infilling_init(self):
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
with self.assertRaises(ValueError):
tokenizer.tokenize("This is <FILL_ME> prefix")

def test_full_tokenizer(self):
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)

Expand Down Expand Up @@ -587,8 +592,8 @@ def main():
end
""",
]
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the other model does not support infilling

tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

formatted_prompt = tokenizer.tokenize(PROMPTS[0])
self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
Expand Down