Fix doctest more (for docs/source/en) #30247

Merged · 2 commits · Apr 15, 2024
Changes from 1 commit
10 changes: 5 additions & 5 deletions docs/source/en/generation_strategies.md
@@ -57,9 +57,10 @@ When you load a model explicitly, you can inspect the generation configuration t
>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
>>> model.generation_config
GenerationConfig {
"bos_token_id": 50256,
"eos_token_id": 50256,
"bos_token_id": 50256,
"eos_token_id": 50256
}
<BLANKLINE>
Collaborator (author) commented:

This is the way to make the doctest pass: a blank line in the expected output has to be written as `<BLANKLINE>`.

See https://wiki.python.org/moin/MultiLineStringsInDocTest

```
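
For readers unfamiliar with the directive: in a doctest, an empty line in the expected output ends the output block, so a genuinely blank line has to be spelled `<BLANKLINE>`. A minimal, self-contained sketch (not part of this PR):

```python
def show_blankline():
    """Toy doctest: a blank line in the expected output is written as <BLANKLINE>.

    >>> print("GenerationConfig {}")
    GenerationConfig {}
    >>> print()
    <BLANKLINE>
    """


if __name__ == "__main__":
    import doctest

    doctest.testmod()  # both examples pass; the empty line matches <BLANKLINE>
```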

Printing out the `model.generation_config` reveals only the values that are different from the default generation
@@ -244,8 +245,7 @@ To enable multinomial sampling set `do_sample=True` and `num_beams=1`.

>>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Today was an amazing day because when you go to the World Cup and you don\'t, or when you don\'t get invited,
that\'s a terrible feeling."']
["Today was an amazing day because we received these wonderful items by the way of a gift shop. The box arrived on a Thursday and I opened it on Monday afternoon to receive the gifts. Both bags featured pieces from all the previous years!\n\nThe box had lots of surprises in it, including some sweet little mini chocolate chips! I don't think I'd eat all of these. This was definitely one of the most expensive presents I have ever got, I actually got most of them for free!\n\nThe first package came"]
Collaborator (author) commented:

I have to check if it respects `max_new_tokens=100`.

Collaborator (author) commented:

OK, it does.

```
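
A quick way to verify that claim outside the doctest (a sketch using the `distilbert/distilgpt2` checkpoint from this page; any causal LM would do): count how many tokens `generate` appended beyond the prompt.

```python
# Sketch: check that generate() adds at most max_new_tokens tokens beyond the prompt.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

inputs = tokenizer("Today was an amazing day because", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)

num_new_tokens = outputs.shape[-1] - inputs["input_ids"].shape[-1]
print(num_new_tokens)       # at most 100; can be fewer if an EOS token is sampled early
assert num_new_tokens <= 100
```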

### Beam-search decoding
@@ -393,7 +393,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are going to the same party. It is a small party, in a small']
['Alice and Bob, a couple of friends of mine, who are both in the same office as']
```

Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
11 changes: 6 additions & 5 deletions docs/source/en/model_doc/code_llama.md
@@ -65,20 +65,20 @@ After conversion, the model and tokenizer can be loaded via:
>>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
>>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
>>> PROMPT = '''def remove_non_ascii(s: str) -> str:
""" <FILL_ME>
return result
'''
... """ <FILL_ME>
... return result
... '''
>>> input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
>>> generated_ids = model.generate(input_ids, max_new_tokens=128)

>>> filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
>>> print(PROMPT.replace("<FILL_ME>", filling))
def remove_non_ascii(s: str) -> str:
""" Remove non-ASCII characters from a string.
<BLANKLINE>
Args:
s: The string to remove non-ASCII characters from.
<BLANKLINE>
Returns:
The string with non-ASCII characters removed.
"""
@@ -87,6 +87,7 @@ def remove_non_ascii(s: str) -> str:
if ord(c) < 128:
result += c
return result
<BLANKLINE>
```
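
The `... ` prefixes added to the prompt above are doctest's continuation prompt: every source line of a multi-line statement after the first must start with `... `, otherwise doctest reads those lines as expected output. A toy sketch of the same pattern:

```python
def multiline_source_example():
    '''Toy doctest: continuation lines of a multi-line statement carry the "... " prefix.

    >>> PROMPT = """def remove_non_ascii(s):
    ...     pass"""
    >>> print(PROMPT)
    def remove_non_ascii(s):
        pass
    '''


if __name__ == "__main__":
    import doctest

    doctest.testmod()
```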

If you only want the infilled part:
10 changes: 6 additions & 4 deletions docs/source/en/model_doc/phi.md
@@ -92,7 +92,9 @@ Phi-2 has been integrated in the development version (4.37.0.dev) of `transforme
>>> outputs = model.generate(**inputs, max_length=30)
>>> text = tokenizer.batch_decode(outputs)[0]
>>> print(text)
'Can you help me write a formal email to a potential business partner proposing a joint venture?\nInput: Company A: ABC Inc.\nCompany B: XYZ Ltd.\nJoint Venture: A new online platform for e-commerce'
Can you help me write a formal email to a potential business partner proposing a joint venture?
Input: Company A: ABC Inc.
Company B
```
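
The rewritten expected output above follows from how doctest compares results: `print(text)` writes the raw string, so the expectation is the unquoted, possibly multi-line text, whereas evaluating `text` by itself would show its repr, with quotes and `\n` escapes. A toy illustration:

```python
def print_vs_repr():
    r"""Toy doctest: repr vs. print() in expected output.

    >>> text = "line one\nline two"
    >>> text                      # the repr keeps the quotes and the \n escape
    'line one\nline two'
    >>> print(text)               # print() yields the raw text, i.e. two real lines
    line one
    line two
    """


if __name__ == "__main__":
    import doctest

    doctest.testmod()
```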

### Example :
@@ -134,7 +136,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> from transformers import PhiForCausalLM, AutoTokenizer

>>> # define the model and tokenizer and push the model and tokens to the GPU.
>>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")
>>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda") # doctest: +SKIP
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

>>> # feel free to change the prompt to your liking.
@@ -144,9 +146,9 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> tokens = tokenizer(prompt, return_tensors="pt").to("cuda")

>>> # use the model to generate new tokens.
>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) # doctest: +SKIP

>>> tokenizer.batch_decode(generated_output)[0]
>>> tokenizer.batch_decode(generated_output)[0] # doctest: +SKIP
'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
```
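
`# doctest: +SKIP` is a built-in doctest directive: the flagged statement is neither executed nor checked, which lets GPU-only or Flash-Attention-only lines stay in the docs while the doctests run on CPU-only machines. A minimal sketch:

```python
def skip_gpu_only_lines():
    """Toy doctest: hardware-dependent lines can be excluded with +SKIP.

    >>> import torch
    >>> torch.ones(2).to("cuda")  # doctest: +SKIP
    tensor([1., 1.], device='cuda:0')
    >>> torch.ones(2).sum().item()  # still executed and checked everywhere
    2.0
    """


if __name__ == "__main__":
    import doctest

    doctest.testmod()
```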

24 changes: 14 additions & 10 deletions docs/source/en/model_doc/stablelm.md
@@ -37,19 +37,21 @@ We also provide `StableLM Zephyr 3B`, an instruction fine-tuned version of the m
The following code snippet demonstrates how to use `StableLM 3B 4E1T` for inference:

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
>>> device = "cuda" # the device to load the model onto

>>> set_seed(0)

>>> tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")
>>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t")
>>> model.to(device)
>>> model.to(device) # doctest: +IGNORE_RESULT

>>> model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True)
>>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
>>> responses
['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to...']
['The weather is always wonderful in Costa Rica, which makes it a prime destination for retirees. That’s where the Pensionado program comes in, offering']
```
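
Two details make the sampled snippet above testable: `set_seed(0)` pins the RNG so the `do_sample=True` continuation is reproducible, and `# doctest: +IGNORE_RESULT` on `model.to(device)` still executes the line but skips checking the very long module repr that `.to()` returns. Note that `IGNORE_RESULT` is not a built-in doctest flag; the test harness has to register it as a custom option flag (the transformers doctest setup does something along these lines; the sketch below is illustrative, not the exact upstream code):

```python
# Sketch: wiring up a custom IGNORE_RESULT doctest option flag.
import doctest

IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")


class IgnoreResultOutputChecker(doctest.OutputChecker):
    def check_output(self, want, got, optionflags):
        # Lines tagged with  # doctest: +IGNORE_RESULT  are executed, but their
        # output is accepted unconditionally.
        if IGNORE_RESULT & optionflags:
            return True
        return super().check_output(want, got, optionflags)


doctest.OutputChecker = IgnoreResultOutputChecker
```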

## Combining StableLM and Flash Attention 2
@@ -66,19 +68,21 @@ Now, to run the model with Flash Attention 2, refer to the snippet below:

```python
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
>>> device = "cuda" # the device to load the model onto

>>> set_seed(0)

>>> tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t")
>>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
>>> model.to(device)
>>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2") # doctest: +SKIP
>>> model.to(device) # doctest: +SKIP

>>> model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True)
>>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
>>> responses
['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to...']
>>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) # doctest: +SKIP
>>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) # doctest: +SKIP
>>> responses # doctest: +SKIP
['The weather is always wonderful in Costa Rica, which makes it a prime destination for retirees. That’s where the Pensionado program comes in, offering']
```


3 changes: 1 addition & 2 deletions docs/source/en/model_doc/starcoder2.md
@@ -42,11 +42,10 @@ These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hu
>>> prompt = "def print_hello_world():"

>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
>>> model.to(device)

>>> generated_ids = model.generate(**model_inputs, max_new_tokens=10, do_sample=False)
>>> tokenizer.batch_decode(generated_ids)[0]
"def print_hello_world():\n\treturn 'Hello World!'"
'def print_hello_world():\n print("Hello World!")\n\ndef print'
```

## Starcoder2Config
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/t5.md
@@ -309,7 +309,7 @@ The predicted tokens will then be placed between the sentinel tokens.
>>> sequence_ids = model.generate(input_ids)
>>> sequences = tokenizer.batch_decode(sequence_ids)
>>> sequences
['<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>']
['<pad> <extra_id_0> park offers <extra_id_1> the <extra_id_2> park.</s>']
Collaborator (author) commented:

@ArthurZucker It would be nice if you could confirm this is the expected format 🙏

```
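
For context on "the predicted tokens will then be placed between the sentinel tokens": the span predicted for each masked position can be recovered by splitting the decoded string on the `<extra_id_*>` sentinels. A small sketch using the output shown above:

```python
import re

decoded = "<pad> <extra_id_0> park offers <extra_id_1> the <extra_id_2> park.</s>"

# Everything before the first sentinel is padding; each following piece is the
# prediction for the corresponding <extra_id_n> slot.
pieces = re.split(r"<extra_id_\d+>", decoded)
spans = [piece.strip() for piece in pieces[1:]]
print(spans)  # ['park offers', 'the', 'park.</s>']
```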

## Performance
4 changes: 2 additions & 2 deletions docs/source/en/tasks/prompting.md
@@ -80,7 +80,7 @@ Run inference with decoder-only models with the `text-generation` pipeline:
>>> prompt = "Hello, I'm a language model"

>>> generator(prompt, max_length = 30)
[{'generated_text': "Hello, I'm a language model expert, so I'm a big believer in the concept that I know very well and then I try to look into"}]
[{'generated_text': "Hello, I'm a language model programmer so you can use some of my stuff. But you also need some sort of a C program to run."}]
```

To run inference with an encoder-decoder, use the `text2text-generation` pipeline:
Expand Down Expand Up @@ -284,7 +284,7 @@ the leading word or phrase (`"Answer:"`) to nudge the model to start generating

>>> for seq in sequences:
... print(f"Result: {seq['generated_text']}")
Result: Modern tools are used, such as immersion blenders
Result: Modern tools often used to make gazpacho include
```

#### Reasoning