
Commit 7c1169e

[4/N]more docs to device agnostic (#40355)
* more docs to device agnostic
* more
* 1
* 2
* Update vitpose.md
* Update camembert.md
* Update camembert.md

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
1 parent 9568b50 commit 7c1169e

44 files changed (+185, -158 lines)

docs/source/en/generation_strategies.md

Lines changed: 41 additions & 25 deletions
@@ -32,12 +32,14 @@ Greedy search works well for tasks with relatively short outputs where creativit

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to default length because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -52,12 +52,14 @@ Enable multinomial sampling with `do_sample=True` and `num_beams=1`.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -75,12 +79,14 @@ Enable beam search with the `num_beams` parameter (should be greater than 1 othe

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -160,12 +166,14 @@ Enable prompt lookup decoding with the `prompt_lookup_num_tokens` parameter.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20, prompt_lookup_num_tokens=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -226,12 +234,14 @@ Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -262,11 +272,13 @@ Enable DoLa with the following parameters.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="high", do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

@@ -280,11 +292,13 @@ Contrast layers 18 and 20 with the final layer.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18,20], do_sample=False, repetition_penalty=1.2)
tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)

@@ -302,12 +316,14 @@ Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversit

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
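Every hunk in this file applies the same substitution: import `infer_device` next to the model classes, resolve the accelerator once, and move both the model and the inputs to that device instead of a hard-coded `"cuda"` string. A minimal standalone sketch of the resulting pattern, assuming a transformers version recent enough to export `infer_device` and substituting a small SmolLM checkpoint for the Llama model used in the diff:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device

# Pick whichever accelerator this host exposes (e.g. "cuda", "xpu", "mps") or fall back to CPU.
device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype="auto").to(device)

# Model and inputs now share the dynamically selected device.
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```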

docs/source/en/llm_tutorial.md

Lines changed: 7 additions & 7 deletions
@@ -56,7 +56,7 @@ Tokenize your input, and set the [`~PreTrainedTokenizer.padding_side`] parameter

```py
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
-model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
+model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(model.device)
```

Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text.

@@ -164,7 +164,7 @@ The section below covers some common issues you may encounter during text genera
[`~GenerationMixin.generate`] returns up to 20 tokens by default unless otherwise specified in a models [`GenerationConfig`]. It is highly recommended to manually set the number of generated tokens with the [`max_new_tokens`] parameter to control the output length. [Decoder-only](https://hf.co/learn/nlp-course/chapter1/6?fw=pt) models returns the initial prompt along with the generated tokens.

```py
-model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
+model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(model.device)
```

<hfoptions id="output-length">

@@ -195,7 +195,7 @@ The default decoding strategy in [`~GenerationMixin.generate`] is *greedy search
For example, enable a [multinomial sampling](./generation_strategies#multinomial-sampling) strategy to generate more diverse outputs. Refer to the [Generation strategy](./generation_strategies) guide for more decoding strategies.

```py
-model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
+model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(model.device)
```

<hfoptions id="decoding">

@@ -227,7 +227,7 @@ Inputs need to be padded if they don't have the same length. But LLMs aren't tra
```py
model_inputs = tokenizer(
    ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
-).to("cuda")
+).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'

@@ -241,7 +241,7 @@ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_s
tokenizer.pad_token = tokenizer.eos_token
model_inputs = tokenizer(
    ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
-).to("cuda")
+).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'

@@ -270,7 +270,7 @@ model = AutoModelForCausalLM.from_pretrained(

```py
prompt = """How many cats does it take to change a light bulb? Reply as a pirate."""
-model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
+model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

@@ -288,7 +288,7 @@ messages = [
    },
    {"role": "user", "content": "How many cats does it take to change a light bulb?"},
]
-model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
+model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
input_length = model_inputs.shape[1]
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
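The llm_tutorial.md hunks take a slightly different route: rather than resolving a device up front, the inputs follow the model through `model.device`, which points at wherever the checkpoint was placed (for example by `device_map="auto"`). A small sketch of that pattern, assuming accelerate is installed for `device_map="auto"` and again using a small stand-in checkpoint rather than the Mistral model from the tutorial:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")

# model.device reflects where from_pretrained placed the weights, so the
# tokenized inputs land on the matching device without naming it explicitly.
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```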

docs/source/en/model_doc/bert.md

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ model = AutoModelForMaskedLM.from_pretrained(
    device_map="auto",
    attn_implementation="sdpa"
)
-inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
+inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

docs/source/en/model_doc/camembert.md

Lines changed: 2 additions & 2 deletions
@@ -60,7 +60,7 @@ from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
-inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

@@ -101,7 +101,7 @@ model = AutoModelForMaskedLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")

-inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

docs/source/en/model_doc/colqwen2.md

Lines changed: 4 additions & 4 deletions
@@ -50,7 +50,7 @@ model_name = "vidore/colqwen2-v1.0-hf"
model = ColQwen2ForRetrieval.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
-    device_map="auto",  # "cpu", "cuda", or "mps" for Apple Silicon
+    device_map="auto",  # "cpu", "cuda", "xpu" or "mps" for Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
)
processor = ColQwen2Processor.from_pretrained(model_name)

@@ -107,10 +107,10 @@ import requests
import torch
from PIL import Image

-from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor
-
+from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor, infer_device

model_name = "vidore/colqwen2-v1.0-hf"
+device = infer_device()

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(

@@ -123,7 +123,7 @@ bnb_config = BitsAndBytesConfig(
model = ColQwen2ForRetrieval.from_pretrained(
    model_name,
    quantization_config=bnb_config,
-    device_map="cuda",
+    device_map=device,
).eval()

processor = ColQwen2Processor.from_pretrained(model_name)
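For the quantized ColQwen2 example, 4-bit weights cannot be moved with `.to()`, so the diff passes the inferred device string to `device_map` instead. A minimal sketch of that idea, assuming `bitsandbytes` is installed with an accelerator it supports, and using a small causal LM as a hypothetical stand-in for the retrieval model:

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, infer_device

device = infer_device()  # e.g. "cuda" or "xpu", depending on the host

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized weights are placed directly on the chosen device via device_map.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM-135M",
    quantization_config=bnb_config,
    device_map=device,
).eval()
```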

docs/source/en/model_doc/csm.md

Lines changed: 6 additions & 6 deletions
@@ -59,7 +59,7 @@ inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
-).to(device)
+).to(model.device)

# infer the model
audio = model.generate(**inputs, output_audio=True)

@@ -104,7 +104,7 @@ inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
-).to(device)
+).to(model.device)

# infer the model
audio = model.generate(**inputs, output_audio=True)

@@ -161,7 +161,7 @@ inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
-).to(device)
+).to(model.device)

audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])

@@ -251,7 +251,7 @@ padded_inputs_1 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
-).to(device)
+).to(model.device)

print("\n" + "="*50)
print("First generation - compiling and recording CUDA graphs...")

@@ -292,7 +292,7 @@ padded_inputs_2 = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
-).to(device)
+).to(model.device)

print("\n" + "="*50)
print("Generation with other inputs!")

@@ -337,7 +337,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    output_labels=True,
-).to(device)
+).to(model.device)

out = model(**inputs)
out.loss.backward()

docs/source/en/model_doc/deberta-v2.md

Lines changed: 2 additions & 2 deletions
@@ -69,7 +69,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
    device_map="auto"
)

-inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)

logits = outputs.logits

@@ -110,7 +110,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
    torch_dtype="float16"
)

-inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()

docs/source/en/model_doc/deberta.md

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ inputs = tokenizer(
    "A soccer game with multiple people playing.",
    "Some people are playing a sport.",
    return_tensors="pt"
-).to("cuda")
+).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

docs/source/en/model_doc/ernie.md

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ model = AutoModelForMaskedLM.from_pretrained(
    torch_dtype=torch.float16,
    device_map="auto"
)
-inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to("cuda")
+inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

docs/source/en/model_doc/gemma3n.md

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ inputs = processor.apply_chat_template(
    return_dict=True,
    return_tensors="pt",
    add_generation_prompt=True,
-).to("cuda")
+).to(model.device)

output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")
print(processor.decode(output[0], skip_special_tokens=True))

0 commit comments
