Update-llama-code (#25826)
* some bug fixes

* updates

* Update code_llama.md

Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>

* Add co-author

Co-authored-by: pcuenca <pedro@latenitesoft.com>

* add a test

* fixup

* nits

* some updates

* fix-copies

* address comments

* nits

* nits

* fix docstring

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update

* add link for https://huggingface.co/spaces/hf-accelerate/model-memory-usage

---------

Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>
Co-authored-by: pcuenca <pedro@latenitesoft.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
4 people authored Sep 1, 2023
1 parent 3587769 commit a4dd53d
Showing 4 changed files with 88 additions and 46 deletions.
docs/source/en/model_doc/code_llama.md (6 changes: 4 additions & 2 deletions)
@@ -49,6 +49,8 @@ Here is a sample usage
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
```
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even though the biggest versions
come in several checkpoints, each of them contains a part of each weight of the model, so they all need to be loaded in RAM).

- After conversion, the model and tokenizer can be loaded via:

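The loading snippet referenced here is collapsed in this diff view; below is a minimal sketch of what that step typically looks like, assuming the conversion script above wrote the checkpoint to `/output/path`:

```python
# Hedged sketch: load the converted Code Llama checkpoint and its tokenizer
# from the output directory produced by the conversion script above.
from transformers import CodeLlamaTokenizer, LlamaForCausalLM

tokenizer = CodeLlamaTokenizer.from_pretrained("/output/path")
model = LlamaForCausalLM.from_pretrained("/output/path")
```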
@@ -90,8 +92,8 @@ If you only want the infilled part:
>>> generator = pipeline("text-generation", model="codellama/CodeLlama-7b-hf", torch_dtype=torch.float16, device_map="auto")
>>> generator('def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result', max_new_tokens=128, return_type=1)
```
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.

Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token gluing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), which can help determine that value. A short sketch of the split is shown below.

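As a rough illustration of that split (a sketch, assuming the `codellama/CodeLlama-7b-hf` checkpoint and its default `<FILL_ME>` fill token):

```python
# Sketch: the tokenizer spots <FILL_ME> in the prompt and assembles the
# <PRE> prefix <SUF> suffix <MID> infilling layout on its own.
from transformers import CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
prompt = 'def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result'
input_ids = tokenizer(prompt)["input_ids"]  # plain list of token ids
# Inspect the first few tokens to see the special infilling markers.
print(tokenizer.convert_ids_to_tokens(input_ids)[:5])
```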
- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.

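A short sketch of that quirk (assuming the `codellama/CodeLlama-7b-hf` checkpoint):

```python
# Sketch: when the first token starts a word, decode() does not prepend
# the sentencepiece prefix space, so the round trip keeps "Banana" intact.
from transformers import CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
ids = tokenizer.encode("Banana", add_special_tokens=False)
print(tokenizer.decode(ids))  # "Banana" (no leading space is added back)
```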
src/transformers/models/code_llama/tokenization_code_llama.py (63 changes: 40 additions & 23 deletions)
@@ -64,6 +64,10 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as
there is no padding token in the original model.
The default configuration matches that of
[codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
which supports prompt infilling.
Args:
vocab_file (`str`):
Path to the vocabulary file.
@@ -80,8 +84,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
Prefix token used for infilling.
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
@@ -111,7 +113,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -125,7 +128,6 @@ def __init__(
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token=None,
prefix_token="▁<PRE>",
middle_token="▁<MID>",
suffix_token="▁<SUF>",
@@ -136,23 +138,26 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
additional_special_tokens=None,
use_default_system_prompt=False,
**kwargs,
):
requires_backends(self, "protobuf")
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

self.use_default_system_prompt = use_default_system_prompt
# mark tokens special to skip them
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []

super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
prefix_token=prefix_token,
@@ -164,6 +169,7 @@ def __init__(
suffix_first=suffix_first,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
additional_special_tokens=additional_special_tokens,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self.vocab_file = vocab_file
@@ -239,6 +245,7 @@ def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
@@ -247,7 +254,7 @@ def get_vocab(self):

def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
# add a prefix space to `prefix`
if self.fill_token in prefix and suffix is None:
if self.fill_token is not None and self.fill_token in prefix and suffix is None:
prefix, suffix = prefix.split(self.fill_token)

if len(prefix) > 0:
@@ -263,9 +270,9 @@ def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[in

if None in (self.prefix_id, self.middle_id, self.suffix_id):
raise ValueError(
"Then input includes a `prefix` and a `suffix` used for the infilling task,"
" the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
"The input either includes a `prefix` and a `suffix` used for the infilling task,"
f" or can be split on the {self.fill_token} token, creating a suffix and prefix,"
" but the model does not support `infilling`."
)
suffix_tokens = self._tokenize(suffix) # make sure CodeLlama sp model does not mess up

@@ -293,10 +300,12 @@ def _tokenize(self, text, **kwargs):
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
@@ -320,6 +329,7 @@ def convert_tokens_to_string(self, tokens):
out_string += self.sp_model.decode(current_sub_tokens)
return out_string

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
@@ -347,6 +357,7 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None)

return (out_vocab_file,)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
@@ -358,6 +369,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):

return output

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
@@ -395,6 +407,7 @@ def get_special_tokens_mask(
+ eos_token_id
)

# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
@@ -443,7 +456,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
>>> from transformers import Conversation
>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
... ) # doctest: +IGNORE_RESULT
```
Args:
@@ -453,16 +466,20 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
`List[int]`:
Input ids for the conversation.
"""
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
if self.use_default_system_prompt:
if len(conversation.past_user_inputs) > 0:
if (
not conversation.past_user_inputs[0].startswith(B_SYS)
or E_SYS not in conversation.past_user_inputs[0]
):
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")

dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
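One practical effect of the changes above is that the default Llama system prompt is now opt-in via `use_default_system_prompt`. A minimal sketch, assuming the `codellama/CodeLlama-7b-Instruct-hf` checkpoint:

```python
# Sketch: the flag defaults to False, so DEFAULT_SYSTEM_PROMPT is no longer
# prepended automatically when building conversation inputs; pass True to
# restore the previous behaviour.
from transformers import CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf", use_default_system_prompt=True
)
print(tokenizer.use_default_system_prompt)  # True
```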
src/transformers/models/code_llama/tokenization_code_llama_fast.py (56 changes: 37 additions & 19 deletions)
@@ -73,7 +73,9 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
refer to this superclass for more information regarding those methods. The default configuration matches that of
[codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
which supports prompt infilling.
Args:
vocab_file (`str`):
@@ -104,6 +106,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
The token used to split the input between the prefix and suffix.
suffix_first (`bool`, *optional*, default to `False`):
Whether the input prompt and suffix should be formatted with the suffix first.
additional_special_tokens (`List[str]`, *optional*):
Additional special tokens used by the tokenizer.
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -124,13 +130,18 @@ def __init__(
suffix_token="▁<SUF>",
eot_token="▁<EOT>",
fill_token="<FILL_ME>",
additional_special_tokens=None,
add_bos_token=True,
add_eos_token=False,
use_default_system_prompt=False,
**kwargs,
):
# mark tokens special to skip them
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
additional_special_tokens = additional_special_tokens or []
for token in [prefix_token, middle_token, suffix_token, eot_token]:
additional_special_tokens += [token] if token is not None else []
self.use_default_system_prompt = use_default_system_prompt

super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
Expand All @@ -144,6 +155,7 @@ def __init__(
suffix_token=suffix_token,
eot_token=eot_token,
fill_token=fill_token,
use_default_system_prompt=use_default_system_prompt,
**kwargs,
)
self._add_bos_token = add_bos_token
@@ -162,6 +174,7 @@ def __init__(
def can_save_slow_tokenizer(self) -> bool:
return os.path.isfile(self.vocab_file) if self.vocab_file else False

# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
@@ -300,6 +313,7 @@ def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_toke
self.set_infilling_processor(True)
return tokens

# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
@@ -343,12 +357,12 @@ def build_inputs_with_special_tokens(
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
return self.bos_token_id + token_ids_0 + self.eos_token_id
return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id

def _build_conversation_input_ids(self, conversation: "Conversation"):
# Copied from transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer._build_conversation_input_ids
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
r"""Builds the input ids for a conversation.
This is the format used in the provided examples. System prompts should be manually added at the beginning of
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
@@ -363,7 +377,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
>>> from transformers import Conversation
>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
... ) # doctest: +IGNORE_RESULT
```
Args:
@@ -373,16 +387,20 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
`List[int]`:
Input ids for the conversation.
"""
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
if self.use_default_system_prompt:
if len(conversation.past_user_inputs) > 0:
if (
not conversation.past_user_inputs[0].startswith(B_SYS)
or E_SYS not in conversation.past_user_inputs[0]
):
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")

dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
@@ -392,7 +410,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation"):
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
)

dialog_tokens = []
dialog_tokens: List[int] = []
dialog_tokens += sum(
[
[self.bos_token_id]
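The fast tokenizer diff above also switches `build_inputs_with_special_tokens` from the infilling prefix/suffix tokens to the BOS/EOS convention used elsewhere. As a rough, observable check of the default behaviour (a sketch, assuming the `codellama/CodeLlama-7b-hf` checkpoint):

```python
# Sketch: with the default add_bos_token=True and add_eos_token=False,
# encoded inputs start with the BOS id and carry no trailing EOS id.
from transformers import CodeLlamaTokenizerFast

tokenizer = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
ids = tokenizer("def add(a, b):")["input_ids"]
print(ids[0] == tokenizer.bos_token_id)   # True
print(ids[-1] == tokenizer.eos_token_id)  # False
```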