66 changes: 41 additions & 25 deletions docs/source/en/generation_strategies.md
@@ -32,12 +32,14 @@ Greedy search works well for tasks with relatively short outputs where creativit…

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()
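# infer_device() is assumed here to pick the best available accelerator ("cuda",
# "xpu", "mps", ...) and fall back to "cpu", so the same snippet runs on any machine.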

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to default length because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -52,12 +54,14 @@ Enable multinomial sampling with `do_sample=True` and `num_beams=1`.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -75,12 +79,14 @@ Enable beam search with the `num_beams` parameter (should be greater than 1 othe…

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -160,12 +166,14 @@ Enable prompt lookup decoding with the `prompt_lookup_num_tokens` parameter.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to(device)
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20, prompt_lookup_num_tokens=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -226,12 +234,14 @@ Enable contrastive search with the `penalty_alpha` and `top_k` parameters. …

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -262,11 +272,13 @@ Enable DoLa with the following parameters.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="high", do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -280,11 +292,13 @@ Contrast layers 18 and 20 with the final layer.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18,20], do_sample=False, repetition_penalty=1.2)
tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
@@ -302,12 +316,14 @@ Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversit…

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
14 changes: 7 additions & 7 deletions docs/source/en/llm_tutorial.md
@@ -56,7 +56,7 @@ Tokenize your input, and set the [`~PreTrainedTokenizer.padding_side`] parameter…

```py
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(model.device)
```

Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text.
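
A minimal sketch of that step, assuming the Mistral-7B model and tokenizer loaded earlier in this guide:

```py
# generate new tokens, then decode them back to text (decoder-only models also
# return the prompt as part of the output)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```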
@@ -164,7 +164,7 @@ The section below covers some common issues you may encounter during text genera…
[`~GenerationMixin.generate`] returns up to 20 tokens by default unless otherwise specified in a model's [`GenerationConfig`]. It is highly recommended to manually set the number of generated tokens with the [`max_new_tokens`] parameter to control the output length. [Decoder-only](https://hf.co/learn/nlp-course/chapter1/6?fw=pt) models return the initial prompt along with the generated tokens.

```py
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(model.device)
```
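
A condensed sketch of the comparison in the options below, assuming the model and tokenizer loaded earlier (the 50-token cap is illustrative):

```py
# default: generate returns up to 20 new tokens
generated_ids = model.generate(**model_inputs)

# explicitly cap the number of generated tokens
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```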

<hfoptions id="output-length">
@@ -195,7 +195,7 @@ The default decoding strategy in [`~GenerationMixin.generate`] is *greedy search*…
For example, enable a [multinomial sampling](./generation_strategies#multinomial-sampling) strategy to generate more diverse outputs. Refer to the [Generation strategy](./generation_strategies) guide for more decoding strategies.

```py
model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(model.device)
```
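
A short sketch of the sampling call the options below illustrate, assuming the model loaded earlier (`do_sample=True` switches from greedy search to multinomial sampling; the token cap is illustrative):

```py
generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```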

<hfoptions id="decoding">
@@ -227,7 +227,7 @@ Inputs need to be padded if they don't have the same length. But LLMs aren't tra…
```py
model_inputs = tokenizer(
["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")
).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'
@@ -241,7 +241,7 @@ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_s…
tokenizer.pad_token = tokenizer.eos_token
model_inputs = tokenizer(
["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")
).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'
@@ -270,7 +270,7 @@ model = AutoModelForCausalLM.from_pretrained(

```py
prompt = """How many cats does it take to change a light bulb? Reply as a pirate."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
@@ -288,7 +288,7 @@ messages = [
},
{"role": "user", "content": "How many cats does it take to change a light bulb?"},
]
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
input_length = model_inputs.shape[1]
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bert.md
@@ -65,7 +65,7 @@ model = AutoModelForMaskedLM.from_pretrained(
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
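
# A sketch of the usual next step (not part of this diff): decode the top prediction
# at the [MASK] position.
mask_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_id = outputs.logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))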
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/camembert.md
@@ -60,7 +60,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
@@ -101,7 +101,7 @@ model = AutoModelForMaskedLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")

inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
8 changes: 4 additions & 4 deletions docs/source/en/model_doc/colqwen2.md
@@ -50,7 +50,7 @@ model_name = "vidore/colqwen2-v1.0-hf"
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto", # "cpu", "cuda", or "mps" for Apple Silicon
device_map="auto", # "cpu", "cuda", "xpu" or "mps" for Apple Silicon
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
)
processor = ColQwen2Processor.from_pretrained(model_name)
@@ -107,10 +107,10 @@ import requests
import torch
from PIL import Image

from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor

from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor, infer_device

model_name = "vidore/colqwen2-v1.0-hf"
device = infer_device()

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
@@ -123,7 +123,7 @@ bnb_config = BitsAndBytesConfig(
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="cuda",
device_map=device,
).eval()

processor = ColQwen2Processor.from_pretrained(model_name)
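
# The body of the 4-bit BitsAndBytesConfig above is elided in this hunk; a typical
# setup might look like the following (an assumption, the omitted values may differ):
#
#     bnb_config = BitsAndBytesConfig(
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.bfloat16,
#     )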
12 changes: 6 additions & 6 deletions docs/source/en/model_doc/csm.md
@@ -59,7 +59,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)

# infer the model
audio = model.generate(**inputs, output_audio=True)
@@ -104,7 +104,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)

# infer the model
audio = model.generate(**inputs, output_audio=True)
@@ -161,7 +161,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)

audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
@@ -251,7 +251,7 @@ padded_inputs_1 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)

print("\n" + "="*50)
print("First generation - compiling and recording CUDA graphs...")
@@ -292,7 +292,7 @@ padded_inputs_2 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)

print("\n" + "="*50)
print("Generation with other inputs!")
@@ -337,7 +337,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
output_labels=True,
).to(device)
).to(model.device)

out = model(**inputs)
out.loss.backward()
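# In a full training loop, an optimizer you define would typically follow this
# backward pass with optimizer.step() and optimizer.zero_grad().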
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/deberta-v2.md
@@ -69,7 +69,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
device_map="auto"
)

inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)

logits = outputs.logits
@@ -110,7 +110,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
torch_dtype="float16"
)

inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/deberta.md
@@ -71,7 +71,7 @@ inputs = tokenizer(
"A soccer game with multiple people playing.",
"Some people are playing a sport.",
return_tensors="pt"
).to("cuda")
).to(model.device)

with torch.no_grad():
logits = model(**inputs).logits
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/ernie.md
@@ -66,7 +66,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto"
)
inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to("cuda")
inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/gemma3n.md
@@ -103,7 +103,7 @@ inputs = processor.apply_chat_template(
return_dict=True,
return_tensors="pt",
add_generation_prompt=True,
).to("cuda")
).to(model.device)

output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")
print(processor.decode(output[0], skip_special_tokens=True))
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/gpt_neo.md
@@ -55,7 +55,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype=torch.float16, device_map="auto", attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)

output = model.generate(**input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True))
@@ -93,7 +93,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
inputs = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/gpt_neox.md
@@ -38,7 +38,7 @@ generous the support of [CoreWeave](https://www.coreweave.com/).
GPT-NeoX-20B was trained with fp16, thus it is recommended to initialize the model as follows:

```python
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b").half().cuda()
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", device_map="auto", torch_dtype=torch.float16)
```
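
For reference, a self-contained version of the updated snippet (a sketch: it assumes Accelerate is installed for `device_map="auto"`, enough memory for the 20B checkpoint, and an illustrative prompt):

```python
import torch
from transformers import AutoTokenizer, GPTNeoXForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/gpt-neox-20b", device_map="auto", torch_dtype=torch.float16
)

inputs = tokenizer("GPT-NeoX-20B is a", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```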

GPT-NeoX-20B also has a different tokenizer from the one used in GPT-J-6B and GPT-Neo. The new tokenizer allocates